diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go index 62d4bb600754..53635198120e 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go @@ -509,6 +509,23 @@ func sortTargetCandidateSetAndPick( } slices.SortFunc(cands.candidates, func(a, b candidateInfo) int { if diversityScoresAlmostEqual(a.diversityScore, b.diversityScore) { + // Note: Consider the case where the current leaseholder's LPI is + // 3 (lower is better) and we have the following candidates: + // - LPI=1 SLS=normal + // - LPI=2 SLS=low + // Currently we consider the low-SLS candidate first. This is in + // contrast to the single-metric allocator, which only considers + // candidates in the lowest-SLS class (i.e. wouldn't even consider + // the low-SLS candidate since we have a candidate at LPI=1). If we + // make the corresponding change in candidateToMoveLease, we would + // match the single-metric allocator's behavior, but it's unclear + // that that would be better. A good middle ground could be sorting + // here by LPI first, then SLS. That should result in mma preferring + // improving the lease preference, but if that is not possible, it + // would settle for not making it worse (than the current + // leaseholder), which the single-metric allocator won't. + // + // TODO(tbg): consider changing this to sort by LPI first, then SLS. return cmp.Or(cmp.Compare(a.sls, b.sls), cmp.Compare(a.leasePreferenceIndex, b.leasePreferenceIndex), cmp.Compare(a.StoreID, b.StoreID)) @@ -537,6 +554,9 @@ func sortTargetCandidateSetAndPick( } } // Diversity is the same. Include if not reaching disk capacity. + // TODO(tbg): remove highDiskSpaceUtilization check here. These candidates + // should instead be filtered out by retainReadyLeaseTargetStoresOnly (which + // filters down the initial candidate set before computing the mean). if !cand.highDiskSpaceUtilization { cands.candidates[j] = cand j++ @@ -815,10 +835,10 @@ func (cs *clusterState) ensureAnalyzedConstraints(rstate *rangeState) { // - Need diversity change for each candidate. // // The first 3 bullets are encapsulated in the helper function -// computeCandidatesForRange. It works for both replica additions and +// computeCandidatesForReplicaTransfer. It works for both replica additions and // rebalancing. // -// For the last bullet (diversity), the caller of computeCandidatesForRange +// For the last bullet (diversity), the caller of computeCandidatesForReplicaTransfer // needs to populate candidateInfo.diversityScore for each candidate in // candidateSet. It does so via diversityScoringMemo. Then the (loadSummary, // diversityScore) pair can be used to order candidates for attempts to add. @@ -846,41 +866,125 @@ func (cs *clusterState) ensureAnalyzedConstraints(rstate *rangeState) { // loadSheddingStore is only specified if this candidate computation is // happening because of overload. -func (cs *clusterState) computeCandidatesForRange( +// +// postMeansExclusions are filtered post-means: their load is included in the +// mean (they're viable locations in principle) but they're not candidates for +// this specific transfer (the classic case: already have a replica). 
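As an aside on the LPI/SLS note above: a self-contained sketch (not part of the patch) contrasting the current SLS-first ordering with the LPI-first ordering the TODO(tbg) suggests, for the example given in that comment. The struct and field names below are simplified stand-ins for candidateInfo; lower values mean "better" for both sls and lpi.

package main

import (
	"cmp"
	"fmt"
	"slices"
)

// cand is a stand-in for candidateInfo: sls models the store load summary
// (lower is better here), lpi the lease preference index (lower is better).
type cand struct {
	storeID int
	sls     int
	lpi     int
}

func main() {
	// The example from the comment: the current leaseholder has LPI=3, and
	// the candidates are LPI=1/SLS=normal and LPI=2/SLS=low.
	cands := []cand{
		{storeID: 1, sls: 2 /* normal */, lpi: 1},
		{storeID: 2, sls: 1 /* low */, lpi: 2},
	}

	// Ordering as in the patch: SLS first, then LPI.
	slices.SortFunc(cands, func(a, b cand) int {
		return cmp.Or(cmp.Compare(a.sls, b.sls), cmp.Compare(a.lpi, b.lpi), cmp.Compare(a.storeID, b.storeID))
	})
	fmt.Printf("SLS-first considers s%d first\n", cands[0].storeID) // s2: lower SLS, worse LPI

	// Ordering suggested by the TODO: LPI first, then SLS.
	slices.SortFunc(cands, func(a, b cand) int {
		return cmp.Or(cmp.Compare(a.lpi, b.lpi), cmp.Compare(a.sls, b.sls), cmp.Compare(a.storeID, b.storeID))
	})
	fmt.Printf("LPI-first considers s%d first\n", cands[0].storeID) // s1: improves the lease preference
}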
+func (cs *clusterState) computeCandidatesForReplicaTransfer( ctx context.Context, - expr constraintsDisj, - storesToExclude storeSet, + conj constraintsConj, + existingReplicas storeSet, + postMeansExclusions storeSet, loadSheddingStore roachpb.StoreID, passObs *rebalancingPassMetricsAndLogger, ) (_ candidateSet, sheddingSLS storeLoadSummary) { - means := cs.meansMemo.getMeans(expr) - if loadSheddingStore > 0 { - sheddingSS := cs.stores[loadSheddingStore] - sheddingSLS = cs.meansMemo.getStoreLoadSummary(ctx, means, loadSheddingStore, sheddingSS.loadSeqNum) - if sheddingSLS.sls <= loadNoChange && sheddingSLS.nls <= loadNoChange { - // In this set of stores, this store no longer looks overloaded. - passObs.replicaShed(notOverloaded) - return candidateSet{}, sheddingSLS - } + // Start with computing the stores (and corresponding means) that satisfy + // the constraint expression. If we don't see a need to filter out any of + // these stores before computing the means, we can use it verbatim, otherwise + // we will recompute the means again below. + cs.scratchDisj[0] = conj + means := cs.meansMemo.getMeans(cs.scratchDisj[:1]) + + // Pre-means filtering: copy to scratch, then filter in place. + // Filter out stores that have a non-OK replica disposition. + cs.scratchStoreSet = append(cs.scratchStoreSet[:0], means.stores...) + filteredStores := retainReadyReplicaTargetStoresOnly(ctx, cs.scratchStoreSet, cs.stores, existingReplicas) + + // Determine which means to use. + // + // TODO(tbg): unit testing. + var effectiveMeans *meansLoad + if len(filteredStores) == len(means.stores) { + // Common case: nothing was filtered, use cached means. + effectiveMeans = &means.meansLoad + } else if len(filteredStores) == 0 { + // No viable candidates at all. + return candidateSet{}, sheddingSLS + } else { + // Some stores were filtered; recompute means over filtered set. + cs.scratchMeans = computeMeansForStoreSet( + cs, filteredStores, cs.meansMemo.scratchNodes, cs.meansMemo.scratchStores) + effectiveMeans = &cs.scratchMeans + log.KvDistribution.VEventf(ctx, 2, + "pre-means filtered %d stores → remaining %v, means: store=%v node=%v", + len(means.stores)-len(filteredStores), filteredStores, + effectiveMeans.storeLoad, effectiveMeans.nodeLoad) } - // We only filter out stores that are not fdOK. The rest of the filtering - // happens later. + + sheddingSLS = cs.computeLoadSummary(ctx, loadSheddingStore, &effectiveMeans.storeLoad, &effectiveMeans.nodeLoad) + if sheddingSLS.sls <= loadNoChange && sheddingSLS.nls <= loadNoChange { + // In this set of stores, this store no longer looks overloaded. + passObs.replicaShed(notOverloaded) + return candidateSet{}, sheddingSLS + } + var cset candidateSet - for _, storeID := range means.stores { - if storesToExclude.contains(storeID) { + for _, storeID := range filteredStores { + if postMeansExclusions.contains(storeID) { + // This store's load is included in the mean, but it's not a viable + // target for this specific transfer (e.g. it already has a replica). 
continue } - ss := cs.stores[storeID] - csls := cs.meansMemo.getStoreLoadSummary(ctx, means, storeID, ss.loadSeqNum) + csls := cs.computeLoadSummary(ctx, storeID, &effectiveMeans.storeLoad, &effectiveMeans.nodeLoad) cset.candidates = append(cset.candidates, candidateInfo{ StoreID: storeID, storeLoadSummary: csls, }) } - cset.means = &means.meansLoad + cset.means = effectiveMeans return cset, sheddingSLS } +// retainReadyReplicaTargetStoresOnly filters the input set to only those stores +// that are ready to accept a replica. A store is not ready if it has a non-OK +// replica disposition. In practice, the input set is already filtered by +// constraints. +// +// Stores already housing a replica (on top of being in the input storeSet) +// bypass this disposition check since they already have the replica - its load +// should be in the mean regardless of its disposition, as we'll pick candidates +// based on improving clustering around the mean. +// +// The input storeSet is mutated (and returned as the result). +func retainReadyReplicaTargetStoresOnly( + ctx context.Context, + in storeSet, + stores map[roachpb.StoreID]*storeState, + existingReplicas storeSet, +) storeSet { + out := in[:0] + for _, storeID := range in { + if existingReplicas.contains(storeID) { + // Stores on existing replicas already have the load and we want to + // include them in the mean, even if they are not accepting new replicas + // or even try to shed. + // + // TODO(tbg): health might play into this, though. For example, when + // a store is dead, whatever load we have from it is stale and we + // are better off not including it. For now, we ignore this problem + // because the mma only handles rebalancing, whereas a replica on a + // dead store would be removed by the single-metric allocator after + // the TimeUntilStoreDead and so would disappear from our view. + out = append(out, storeID) + continue + } + ss := stores[storeID] + switch { + case ss.status.Disposition.Replica != ReplicaDispositionOK: + log.KvDistribution.VEventf(ctx, 2, "skipping s%d for replica transfer: replica disposition %v (health %v)", storeID, ss.status.Disposition.Replica, ss.status.Health) + case highDiskSpaceUtilization(ss.reportedLoad[ByteSize], ss.capacity[ByteSize]): + // TODO(tbg): remove this from mma and just let the caller set this + // disposition based on the following cluster settings: + // - kv.allocator.max_disk_utilization_threshold + // - kv.allocator.rebalance_to_max_disk_utilization_threshold + log.KvDistribution.VEventf(ctx, 2, "skipping s%d for replica transfer: high disk utilization (health %v)", storeID, ss.status.Health) + default: + out = append(out, storeID) + } + } + return out +} + // Diversity scoring is very amenable to caching, since the set of unique // locality tiers for range replicas is likely to be small. And the cache does // not need to be cleared after every allocator pass. 
This caching is done via diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go index 7cf3dcca3216..0426082d9beb 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go @@ -1268,6 +1268,9 @@ type clusterState struct { ranges map[roachpb.RangeID]*rangeState scratchRangeMap map[roachpb.RangeID]struct{} + scratchStoreSet storeSet // scratch space for pre-means filtering + scratchMeans meansLoad // scratch space for recomputed means + scratchDisj [1]constraintsConj // scratch space for getMeans call // Added to when a change is proposed. Will also add to corresponding // rangeState.pendingChanges and to the affected storeStates. @@ -1623,6 +1626,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( topk.dim = WriteBandwidth } if sls.highDiskSpaceUtilization { + // If disk space is running out, shedding bytes becomes the top priority. topk.dim = ByteSize } else if sls.sls > loadNoChange { // If multiple dimensions are contributing the same loadSummary, we will pick @@ -2191,15 +2195,42 @@ func (cs *clusterState) getNodeReportedLoad(nodeID roachpb.NodeID) *NodeLoad { return nil } -// canShedAndAddLoad returns true if the delta can be added to the target -// store and removed from the src store, such that the relative load summaries -// will not get worse. +// canShedAndAddLoad returns true if the delta can be added to the target store +// and removed from the src store. It does not change any state between the call +// and return. // -// It does not change any state between the call and return. +// overloadDim represents the dimension that is overloaded in the source and the +// function requires that along that dimension, the target is < loadNoChange and +// the source is > loadNoChange. // -// overloadDim represents the dimension that is overloaded in the source and -// the function requires that the target must be currently < loadNoChange -// along that dimension. +// Broadly speaking, the method tries to ascertain that the target wouldn't be +// worse off than the source following the transfer. To do this, the method +// looks at a load summary for the target that would result from the load +// transfer (targetLoadSummary). +// +// When onlyConsiderTargetCPUSummary is true, the targetLoadSummary derives from +// the target's post-transfer CPU dimension only. This is appropriate when a lease is +// transferred, as this should only affect the CPU dimension, and we don't want +// lease transfers to be subject to stricter checks related to other dimensions. +// When onlyConsiderTargetCPUSummary is false, targetLoadSummary is the target's +// worst post-transfer load summary. In both cases, the node load summary is also +// considered. +// +// TODO(tbg): understand and explain why the node load summary is in the mix here. +// +// In either case, if the targetLoadSummary is < loadNoChange, the change is +// permitted right away. Otherwise, stricter checks apply: After the transfer, +// - the target must not be overloadUrgent, +// - the target must have no pending changes (to delay making a potentially non-ideal +// choice of the target), +// - the target's overloaded dimension's summary must not be worse than the +// source's ("overloadedDimPermitsChange"), +// - along each of the other (!= overloadedDim) dimensions, the percentage +// increase in load is at most a third of that of the overloaded dimension. +// (e.g.
if CPU goes up by 30%, WriteBandwidth can go up by at most 10%). +// - the target's node load summary must not be worse than the target's store +// load summary. See inline comment for more details. + func (cs *clusterState) canShedAndAddLoad( ctx context.Context, srcSS *storeState, @@ -2216,6 +2247,11 @@ func (cs *clusterState) canShedAndAddLoad( // the load delta addition flips the loadSummary for either the target or the // source, which suggests it might be useful to add this to verbose logging. + // Compute srcSLS and targetSLS, which are the load summaries of the source + // and target that would result from moving the lease. + // + // TODO(tbg): extract this into a helper and set it up so that it doesn't + // temporarily modify the cluster state. targetNS := cs.nodes[targetSS.NodeID] // Add the delta. deltaToAdd := loadVectorToAdd(delta) @@ -2254,28 +2290,16 @@ func (cs *clusterState) canShedAndAddLoad( reason.WriteString("targetSLS.highDiskSpaceUtilization") return false } - // We define targetSummary as a summarization across all dimensions of the - // target. A targetSummary < loadNoChange always accepts the change. When - // the targetSummary >= loadNoChange, we are stricter and require both that - // there are no pending changes in the target, and the target is "not worse" - // in a way that will cause thrashing, where the details are defined below. - // The no pending changes requirement is to delay making a potentially - // non-ideal choice of the target. - // - // NB: The target's overload dimension summary must have been < - // loadNoChange, and the source must have been > loadNoChange. + + // We define targetSummary as the worst of the considered load dimensions + // (only CPU, or all). var targetSummary loadSummary if onlyConsiderTargetCPUSummary { targetSummary = targetSLS.dimSummary[CPURate] - if targetSummary < targetSLS.nls { - targetSummary = targetSLS.nls - } } else { targetSummary = targetSLS.sls - if targetSummary < targetSLS.nls { - targetSummary = targetSLS.nls - } } + targetSummary = max(targetSummary, targetSLS.nls) if targetSummary < loadNoChange { return true @@ -2284,6 +2308,7 @@ func (cs *clusterState) canShedAndAddLoad( reason.WriteString("overloadUrgent") return false } + // Need to consider additional factors. // // It is possible that both are overloadSlow in aggregate. We want to make @@ -2312,7 +2337,7 @@ func (cs *clusterState) canShedAndAddLoad( // That boolean predicate can also be too strict, in that we should permit // transitions to overloadSlow along one dimension, to allow for an // exchange. - overloadedDimFractionIncrease := math.MaxFloat64 + var overloadedDimFractionIncrease float64 if targetSS.adjusted.load[overloadedDim] > 0 { overloadedDimFractionIncrease = float64(deltaToAdd[overloadedDim]) / float64(targetSS.adjusted.load[overloadedDim]) @@ -2353,14 +2378,33 @@ func (cs *clusterState) canShedAndAddLoad( targetSLS.maxFractionPendingIncrease < epsilon && targetSLS.maxFractionPendingDecrease < epsilon && // NB: targetSLS.nls <= targetSLS.sls is not a typo, in that we are - // comparing targetSLS with itself. The nls only captures node-level - // CPU, so if a store that is overloaded wrt WriteBandwidth wants to - // shed to a store that is overloaded wrt CPURate, we need to permit - // that. However, the nls of the former will be less than the that of - // the latter.
By looking at the nls of the target here, we are making - // sure that it is no worse than the sls of the target, since if it - // is, the node is overloaded wrt CPU due to some other store on that - // node, and we should be shedding that load first. + // comparing targetSLS with itself. + // + // Consider a node that has two stores: + // - s1 is low on CPU + // - s2 is very high on CPU, resulting in a node load summary of + // overloadSlow or overloadUrgent + // + // In this code path, targetSLS is >= loadNoChange, so there must be + // some overload dimension in targetSLS. If it comes from write bandwidth + // (or any other non-CPU dimension), without this check, s1 might be + // considered an acceptable target for adding CPU load. But it is clearly + // not a good target, since the node housing s1 is CPU overloaded - s2 + // should be shedding CPU load first. + // This example motivates the condition below. If we reach this code, + // we know that targetSLS >= loadNoChange, and we decide: + // - at sls=loadNoChange, we require nls <= loadNoChange + // - at sls=overloadSlow, we require nls <= overloadSlow + // - at sls=overloadUrgent, we require nls <= overloadUrgent. + // In other words, whenever a node level summary was "bumped up" beyond + // the target's by some other local store, we reject the change. + // + // TODO(tbg): While the example illustrates that "something had to be + // done", I don't understand why it makes sense to solve this exactly + // as it was done. The node level summary is based on node-wide CPU + // utilization as well as its distance from the mean (across the + // candidate set). Store summaries a) reflect the worst dimension, and + // b) on the CPU dimension are based on the store-apportioned capacity. targetSLS.nls <= targetSLS.sls if canAddLoad { return true @@ -2451,10 +2495,11 @@ func computeLoadSummary( } nls := loadSummaryForDimension(ctx, storeIDForLogging, ns.NodeID, CPURate, ns.adjustedCPU, ns.CapacityCPU, mnl.loadCPU, mnl.utilCPU) return storeLoadSummary{ - worstDim: worstDim, - sls: sls, - nls: nls, - dimSummary: dimSummary, + worstDim: worstDim, + sls: sls, + nls: nls, + dimSummary: dimSummary, + // TODO(tbg): remove highDiskSpaceUtilization. highDiskSpaceUtilization: highDiskSpaceUtil, maxFractionPendingIncrease: ss.maxFractionPendingIncrease, maxFractionPendingDecrease: ss.maxFractionPendingDecrease, diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go index c51505ee21f6..7dec874fbafd 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go @@ -76,12 +76,13 @@ type rebalanceEnv struct { // pass. Can be nil. passObs *rebalancingPassMetricsAndLogger // Scratch variables reused across iterations. + // TODO(tbg): these are a potential source of errors (imagine two nested + // calls using the same scratch variable). Just make a global variable + // that wraps a bunch of sync.Pools for the types we need.
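A minimal sketch (not part of the patch) of the direction the TODO(tbg) above points at: handing out per-call scratch space from sync.Pools instead of sharing fields on rebalanceEnv, so nested or concurrent uses cannot clobber each other. The identifiers below (storeSetPool, getStoreSetScratch, putStoreSetScratch, and the simplified storeSet) are illustrative and do not exist in this package.

package main

import (
	"fmt"
	"sync"
)

// storeSet is a simplified stand-in for the package's storeSet type.
type storeSet []int32

// storeSetPool hands out reusable storeSet scratch slices. Storing a pointer
// avoids an extra allocation when the slice header is boxed into the pool.
var storeSetPool = sync.Pool{
	New: func() any {
		s := make(storeSet, 0, 16)
		return &s
	},
}

// getStoreSetScratch returns an empty scratch storeSet; each caller gets its
// own slice, so nested calls cannot stomp on each other's state.
func getStoreSetScratch() *storeSet {
	s := storeSetPool.Get().(*storeSet)
	*s = (*s)[:0]
	return s
}

// putStoreSetScratch returns the scratch slice to the pool for reuse.
func putStoreSetScratch(s *storeSet) {
	storeSetPool.Put(s)
}

func main() {
	s := getStoreSetScratch()
	defer putStoreSetScratch(s)
	*s = append(*s, 1, 2, 3)
	fmt.Println(*s) // [1 2 3]
}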
scratch struct { - disj [1]constraintsConj - storesToExclude storeSet - storesToExcludeForRange storeSet - nodes map[roachpb.NodeID]*NodeLoad - stores map[roachpb.StoreID]struct{} + postMeansExclusions storeSet + nodes map[roachpb.NodeID]*NodeLoad + stores map[roachpb.StoreID]struct{} } } @@ -429,22 +430,11 @@ func (re *rebalanceEnv) rebalanceReplicas( localStoreID roachpb.StoreID, ignoreLevel ignoreLevel, ) { - // If the node is cpu overloaded, or the store/node is not fdOK, exclude - // the other stores on this node from receiving replicas shed by this - // store. - excludeStoresOnNode := store.nls > overloadSlow - re.scratch.storesToExclude = re.scratch.storesToExclude[:0] - if excludeStoresOnNode { - nodeID := ss.NodeID - for _, storeID := range re.nodes[nodeID].stores { - re.scratch.storesToExclude.insert(storeID) - } - log.KvDistribution.VEventf(ctx, 2, "excluding all stores on n%d due to overload/fd status", nodeID) - } else { - // This store is excluded of course. - re.scratch.storesToExclude.insert(store.StoreID) + if store.StoreID != localStoreID && store.dimSummary[CPURate] >= overloadSlow && + re.now.Sub(ss.overloadStartTime) < remoteStoreLeaseSheddingGraceDuration { + log.KvDistribution.VEventf(ctx, 2, "skipping remote store s%d: in lease shedding grace period", store.StoreID) + return } - // Iterate over top-K ranges first and try to move them. topKRanges := ss.adjusted.topKRanges[localStoreID] n := topKRanges.len() @@ -498,6 +488,8 @@ func (re *rebalanceEnv) rebalanceReplicas( "rstate_replicas=%v rstate_constraints=%v", store.StoreID, rangeID, rstate.pendingChanges, rstate.replicas, rstate.constraints)) } + // Get the constraint conjunction which will allow us to look up stores + // that could replace the shedding store. var conj constraintsConj var err error if isVoter { @@ -512,32 +504,43 @@ func (re *rebalanceEnv) rebalanceReplicas( log.KvDistribution.VEventf(ctx, 2, "skipping r%d: constraint violation needs fixing first: %v", rangeID, err) continue } - re.scratch.disj[0] = conj - re.scratch.storesToExcludeForRange = append(re.scratch.storesToExcludeForRange[:0], re.scratch.storesToExclude...) - // Also exclude all stores on nodes that have existing replicas. + // Build post-means exclusions: stores whose load is included in the mean + // (they're viable locations in principle) but aren't valid targets for + // this specific transfer. + // + // NB: to prevent placing replicas on multiple CRDB nodes sharing a + // host, we'd need to make changes here. + // See: https://github.com/cockroachdb/cockroach/issues/153863 + re.scratch.postMeansExclusions = re.scratch.postMeansExclusions[:0] + existingReplicas := storeSet{} // TODO(tbg): avoid allocation for _, replica := range rstate.replicas { storeID := replica.StoreID + existingReplicas.insert(storeID) if storeID == store.StoreID { - // We don't exclude other stores on this node, since we are allowed to - // transfer the range to them. If the node is overloaded or not fdOK, - // we have already excluded those stores above. + // Exclude the shedding store (we're moving away from it), but not + // other stores on its node (within-node rebalance is allowed). + re.scratch.postMeansExclusions.insert(storeID) continue } + // Exclude all stores on nodes with other existing replicas. nodeID := re.stores[storeID].NodeID for _, storeID := range re.nodes[nodeID].stores { - re.scratch.storesToExcludeForRange.insert(storeID) + re.scratch.postMeansExclusions.insert(storeID) } } + + // Compute the candidates. 
These are already filtered down to only those stores + // that we'll actually be happy to transfer the range to. + // Note that existingReplicas is a subset of postMeansExclusions, so they'll + // be included in the mean, but are never considered as candidates. + // + // TODO(sumeer): eliminate cands allocations by passing a scratch slice. - cands, ssSLS := re.computeCandidatesForRange( - ctx, re.scratch.disj[:], re.scratch.storesToExcludeForRange, store.StoreID, re.passObs) + cands, ssSLS := re.computeCandidatesForReplicaTransfer(ctx, conj, existingReplicas, re.scratch.postMeansExclusions, store.StoreID, re.passObs) log.KvDistribution.VEventf(ctx, 2, "considering replica-transfer r%v from s%v: store load %v", rangeID, store.StoreID, ss.adjusted.load) - if log.V(2) { - log.KvDistribution.Infof(ctx, "candidates are:") - for _, c := range cands.candidates { - log.KvDistribution.Infof(ctx, " s%d: %s", c.StoreID, c.storeLoadSummary) - } + log.KvDistribution.VEventf(ctx, 3, "candidates are:") + for _, c := range cands.candidates { + log.KvDistribution.VEventf(ctx, 3, " s%d: %s", c.StoreID, c.storeLoadSummary) } if len(cands.candidates) == 0 { @@ -700,23 +703,47 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( "store=%v range_id=%v should be leaseholder but isn't", store.StoreID, rangeID)) } + + // Get the stores from the replica set that are at least as good as the + // current leaseholder wrt satisfaction of lease preferences. This means + // that mma will never make lease preference violations worse when + // moving the lease. + // + // Example: + // s1 and s2 in us-east, s3 in us-central, lease preference for us-east. + // - if s3 has the lease: candsPL = [s1, s2, s3] + // - if s1 has the lease: candsPL = [s2, s1] (s3 filtered out) + // - if s2 has the lease: candsPL = [s1, s2] (s3 filtered out) + // + // In effect, we interpret each replica whose store is worse than the current + // leaseholder as ill-disposed for the lease and (pre-means) filter them out. cands, _ := rstate.constraints.candidatesToMoveLease() - var candsPL storeSet + // candsPL is the set of stores to consider for the mean. This should + // include the current leaseholder, so we add it in, but only a bit + // further down. + var candsPL storeSet // TODO(tbg): avoid allocation for _, cand := range cands { candsPL.insert(cand.storeID) } - // Always consider the local store (which already holds the lease) as a - // candidate, so that we don't move the lease away if keeping it would be - // the better option overall. - // TODO(tbg): is this really needed? We intentionally exclude the leaseholder - // in candidatesToMoveLease, so why reinsert it now? - candsPL.insert(store.StoreID) - if len(candsPL) <= 1 { + if len(candsPL) == 0 { + // No candidates to move the lease to. We bail early to avoid some + // logging below that is not helpful if we didn't have any real + // candidates to begin with. re.passObs.leaseShed(noCandidate) - continue // leaseholder is the only candidate + continue } + // NB: intentionally log before re-adding the current leaseholder so + // we don't list it as a candidate. + log.KvDistribution.VEventf(ctx, 2, "considering lease-transfer r%v from s%v: candidates are %v", rangeID, store.StoreID, candsPL) + // Now candsPL is ready for computing the means. + candsPL.insert(store.StoreID) + + // Filter by disposition. Note that we pass the shedding store in to + // make sure that its disposition does not matter.
In other words, the + // leaseholder is always going to include itself in the mean, even if it + // is ill-disposed towards leases. + candsPL = retainReadyLeaseTargetStoresOnly(ctx, candsPL, re.stores, rangeID, store.StoreID) - candsPL = retainReadyLeaseTargetStoresOnly(ctx, candsPL, re.stores, rangeID) // INVARIANT: candsPL - {store.StoreID} \subset cands if len(candsPL) == 0 || (len(candsPL) == 1 && candsPL[0] == store.StoreID) { re.passObs.leaseShed(noHealthyCandidate) @@ -729,9 +756,10 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( // which is also in cands. clear(re.scratch.nodes) + // NB: candsPL is not empty - it includes at least the current leaseholder + // and one additional candidate. means := computeMeansForStoreSet(re, candsPL, re.scratch.nodes, re.scratch.stores) sls := re.computeLoadSummary(ctx, store.StoreID, &means.storeLoad, &means.nodeLoad) - log.KvDistribution.VEventf(ctx, 2, "considering lease-transfer r%v from s%v: candidates are %v", rangeID, store.StoreID, candsPL) if sls.dimSummary[CPURate] < overloadSlow { // This store is not cpu overloaded relative to these candidates for // this range. @@ -741,6 +769,9 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( } var candsSet candidateSet for _, cand := range cands { + if cand.storeID == store.StoreID { + panic(errors.AssertionFailedf("current leaseholder can't be a candidate: %v", cand)) + } if !candsPL.contains(cand.storeID) { // Skip candidates that are filtered out by // retainReadyLeaseTargetStoresOnly. @@ -822,10 +853,30 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( // // The input storeSet is mutated (and used to for the returned result). func retainReadyLeaseTargetStoresOnly( - ctx context.Context, in storeSet, stores map[roachpb.StoreID]*storeState, rangeID roachpb.RangeID, + ctx context.Context, + in storeSet, + stores map[roachpb.StoreID]*storeState, + rangeID roachpb.RangeID, + existingLeaseholder roachpb.StoreID, ) storeSet { out := in[:0] for _, storeID := range in { + if storeID == existingLeaseholder { + // The existing leaseholder is always included in the mean, even if + // it is ill-disposed towards leases. Because it is holding the lease, + // we know that its load is recent. + // + // Example: Consider a range with leaseholder on s1 and voters on s2 + // and s3. All stores have CPU capacity of 100 units. s1 has load 40, + // s2 has load 80, s3 has load 80. The mean CPU utilization (total + // load / total capacity) is (40+80+80)/(100+100+100) = 66% if we + // include s1 and (80+80)/(100+100) = 80% if we don't. + // If we filtered out s1 just because it is ill-disposed towards + // leases, s2 and s3 would be exactly on the mean and we might + // consider transferring the lease to them, but we should not. + out = append(out, storeID) + continue + } s := stores[storeID].status switch { case s.Disposition.Lease != LeaseDispositionOK: diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go index e25af0004929..92c7db61b480 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go @@ -419,6 +419,11 @@ func TestClusterState(t *testing.T) { ctx, finishAndGet := tracing.ContextWithRecordingSpan( context.Background(), tr, d.Cmd, ) + if d.HasArg("breakpoint") { + // You can set a debugger breakpoint here and use `breakpoint=true` + // in a datadriven command to hit it. 
+ t.Log("hit breakpoint") + } switch d.Cmd { case "include": loc := dd.ScanArg[string](t, d, "path") @@ -630,7 +635,21 @@ func TestClusterState(t *testing.T) { case "retain-ready-lease-target-stores-only": in := dd.ScanArg[[]roachpb.StoreID](t, d, "in") rangeID := dd.ScanArg[roachpb.RangeID](t, d, "range-id") - out := retainReadyLeaseTargetStoresOnly(ctx, storeSet(in), cs.stores, rangeID) + lh, _ := dd.ScanArgOpt[roachpb.StoreID](t, d, "leaseholder") + out := retainReadyLeaseTargetStoresOnly(ctx, storeSet(in), cs.stores, rangeID, lh) + rec := finishAndGet() + var sb redact.StringBuilder + rec.SafeFormatMinimal(&sb) + return fmt.Sprintf("%s%v\n", sb.String(), out) + + case "retain-ready-replica-target-stores-only": + in := dd.ScanArg[[]roachpb.StoreID](t, d, "in") + replicas, _ := dd.ScanArgOpt[[]roachpb.StoreID](t, d, "replicas") + var replicasSet storeSet + for _, replica := range replicas { + replicasSet.insert(replica) + } + out := retainReadyReplicaTargetStoresOnly(ctx, storeSet(in), cs.stores, replicasSet) rec := finishAndGet() var sb redact.StringBuilder rec.SafeFormatMinimal(&sb) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt index d4ed37ba94ec..61c77a69a356 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt @@ -124,6 +124,11 @@ retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 skipping s2 for lease transfer: replica lease disposition refusing (health ok) [1 3] +# The leaseholder is exempt from the disposition check. +retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 leaseholder=2 +---- +[1 2 3] + # Restore s2's replica disposition. 
store-leaseholder-msg store-id=1 @@ -137,3 +142,7 @@ store-id=1 retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 ---- [1 2 3] + +retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 leaseholder=2 +---- +[1 2 3] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt index 3f1e9da46fc9..bc8de2a9a72c 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt @@ -181,11 +181,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] start processing shedding store s1: cpu node load overloadSlow, store load overloadSlow, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r3:[cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadSlow >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r3 from s1: candidates are [3 5] [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=693 fractionUsed=90.00% meanUtil=83.20% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=693 fractionUsed=90.00% meanUtil=83.20% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 3 5] [mmaid=1] load summary for dim=CPURate (s3): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=693 fractionUsed=90.00% meanUtil=83.20% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -207,11 +207,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n5s5: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=loadNormal worst=CPURate cpu=loadNormal writes=loadNormal bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s5 for r3: delta load [cpu:180ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] considering lease-transfer r2 from s1: candidates are [2 4] [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=700 fractionUsed=90.00% meanUtil=84.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% 
meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=700 fractionUsed=90.00% meanUtil=84.00% capacity=1000] -[mmaid=1] considering lease-transfer r2 from s1: candidates are [1 2 4] [mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=700 fractionUsed=90.00% meanUtil=84.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -233,11 +233,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n4s4: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=loadNormal worst=CPURate cpu=loadNormal writes=loadNormal bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s4 for r2: delta load [cpu:180ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=900 meanLoad=900 fractionUsed=90.00% meanUtil=90.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=900 meanLoad=900 fractionUsed=90.00% meanUtil=90.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 2 3] [mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates [mmaid=1] attempting to shed replicas next [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] @@ -261,6 +261,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] load summary for dim=ByteSize (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] [mmaid=1] load summary for dim=CPURate (n7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] [mmaid=1] considering replica-transfer r3 from s1: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s2: (store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s6: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s7: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false 
frac_pending=0.00,0.00(true)) [mmaid=1] discarding candidates with higher load than lowestLoadSet(loadNormal): s2(SLS:overloadSlow, overloadedDimLoadSummary:overloadSlow), overloadedDim:CPURate [mmaid=1] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s6(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s7(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s6 [mmaid=1] load summary for dim=CPURate (s6): overloadUrgent, reason: fractionUsed > 90% [load=480 meanLoad=541 fractionUsed=96.00% meanUtil=75.80% capacity=500] @@ -274,6 +279,10 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n6s6: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s6 for r3: delta load [cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] [mmaid=1] load summary for dim=CPURate (s3): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -282,7 +291,20 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] load summary for dim=WriteBandwidth (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] [mmaid=1] load summary for dim=ByteSize (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] [mmaid=1] load summary for dim=CPURate (n5): loadLow, reason: load is >10% below mean [load=280 meanLoad=541 fractionUsed=56.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s7): loadLow, reason: 
load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] [mmaid=1] considering replica-transfer r2 from s1: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s3: (store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s5: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s6: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s7: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=1] discarding candidates with higher load than lowestLoadSet(loadNormal): s3(SLS:overloadSlow, overloadedDimLoadSummary:overloadSlow), overloadedDim:CPURate [mmaid=1] sortTargetCandidateSetAndPick: candidates: s5(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s6(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s7(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s6 [mmaid=1] load summary for dim=CPURate (s6): overloadUrgent, reason: fractionUsed > 90% [load=480 meanLoad=541 fractionUsed=96.00% meanUtil=75.80% capacity=500] @@ -296,7 +318,32 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n6s6: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s6 for r2: delta load [cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=300 meanLoad=541 fractionUsed=60.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: 
load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=300 meanLoad=541 fractionUsed=60.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s5): loadLow, reason: load is >10% below mean [load=280 meanLoad=541 fractionUsed=56.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n5): loadLow, reason: load is >10% below mean [load=280 meanLoad=541 fractionUsed=56.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] [mmaid=1] considering replica-transfer r1 from s1: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s5: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s6: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s7: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=1] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s5(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s6(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s7(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s7 [mmaid=1] load summary for dim=CPURate (s7): overloadUrgent, reason: fractionUsed > 90% [load=470 meanLoad=541 fractionUsed=94.00% meanUtil=75.80% capacity=500] [mmaid=1] load summary for dim=WriteBandwidth (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 
fractionUsed=0.00% meanUtil=0.00% capacity=500] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt index d403b8024c39..192a62144629 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt @@ -29,11 +29,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=0.4 [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r4:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r3:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r4 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r4 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -53,11 +53,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=0.4 [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(success): shedding r4 lease from s1 to s2 [change:r4=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:770ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.23) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r3 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, 
reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt index 1da24fd04dbe..3e7c5ee0314e 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt @@ -69,20 +69,20 @@ rebalance-stores store-id=1 [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] skipping s2 for lease transfer: lease disposition refusing (health ok) [mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 3] [mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates [mmaid=1] attempting to shed replicas next -[mmaid=1] excluding all stores on n1 due to overload/fd status [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=700 fractionUsed=100.00% meanUtil=70.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): 
loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=700 fractionUsed=100.00% meanUtil=70.00% capacity=1000] [mmaid=1] considering replica-transfer r1 from s1: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: [mmaid=1] result(failed): no candidates found for r1 after exclusions [mmaid=1] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] no top-K[CPURate] ranges found for s3 with lease on local s1 diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt index 58597c4f4f00..6c671b32b6bb 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt @@ -77,12 +77,12 @@ rebalance-stores store-id=1 [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] skipping s2 for lease transfer: replica lease disposition refusing (health ok) [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=900 fractionUsed=100.00% meanUtil=90.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=900 fractionUsed=100.00% meanUtil=90.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 3] [mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=800 meanLoad=900 fractionUsed=80.00% meanUtil=90.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt index 31fe7defbd37..ad88ed7c56fb 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt @@ -29,11 +29,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=1.0 max-lease-tr 
[mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r4:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r3:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r4 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r4 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -53,11 +53,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=1.0 max-lease-tr [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(success): shedding r4 lease from s1 to s2 [change:r4=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:770ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.23) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r3 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] 
-[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt index 5e2a8bb03e8b..19832e40e1c2 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt @@ -28,11 +28,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r4:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r3:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r4 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r4 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -52,11 +52,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal 
node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(success): shedding r4 lease from s1 to s2 [change:r4=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:770ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.23) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r3 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -77,11 +77,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] cannot add load to n2s2: due to target_summary(overloadSlow)>=loadNoChange,targetSLS.frac_pending(2.53or0.00>=epsilon) [mmaid=1] [target_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=2.53,0.00(false)),src_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.23(false))] [mmaid=1] result(failed): cannot shed from s1 to s2 for r3: delta load [cpu:230ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] considering lease-transfer r2 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r2 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 
fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -101,11 +101,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=540 meanLoad=400 fractionUsed=54.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n3s3: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.23(false))] [mmaid=1] result(success): shedding r2 lease from s1 to s3 [change:r2=[transfer_to=3 cids=3,4]] with resulting loads source:[cpu:540ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.46) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=540 meanLoad=400 fractionUsed=54.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=540 meanLoad=400 fractionUsed=54.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt index 684bb9538bbc..90a48acf5031 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt @@ -16,7 +16,6 @@ rebalance-stores store-id=1 max-range-move-count=1 fraction-pending-decrease-thr [mmaid=2] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=2] top-K[CPURate] ranges for s3 with lease on local s1: r3:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=2] attempting to shed replicas next -[mmaid=2] excluding all stores on n3 due to overload/fd status [mmaid=2] load summary for dim=CPURate (s3): overloadUrgent, reason: 
fractionUsed > 90% [load=1000 meanLoad=525 fractionUsed=100.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -26,6 +25,8 @@ rebalance-stores store-id=1 max-range-move-count=1 fraction-pending-decrease-thr [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=100 meanLoad=525 fractionUsed=10.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r3 from s3: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt index deded1f3501a..f77c7051691b 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt @@ -16,7 +16,6 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=2] top-K[CPURate] ranges for s3 with lease on local s1: r3:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=2] attempting to shed replicas next -[mmaid=2] excluding all stores on n3 due to overload/fd status [mmaid=2] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=525 fractionUsed=100.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -26,6 +25,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=100 meanLoad=525 fractionUsed=10.00% 
meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r3 from s3: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt new file mode 100644 index 000000000000..f9d41de9b0b9 --- /dev/null +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt @@ -0,0 +1,222 @@ +# This test verifies lateral (within-node) replica transfers are allowed. +# Stores on the shedding store's node are NOT excluded as targets, allowing +# replica movement between stores on the same node. +# +# Setup: +# - n1s1: has replica, leaseholder, not overloaded +# - n2s2: has replica, not overloaded +# - n3s3: has replica, write-bandwidth overloaded (trying to shed) +# - n3s4: same node as s3, no replica, not overloaded - VALID lateral target +# +# Expected: s3 sheds its replica to s4 (lateral transfer within n3). + +set-store + store-id=1 node-id=1 + store-id=2 node-id=2 + store-id=3 node-id=3 + store-id=4 node-id=3 +---- +node-id=1 locality-tiers=node=1 + store-id=1 attrs= +node-id=2 locality-tiers=node=2 + store-id=2 attrs= +node-id=3 locality-tiers=node=3 + store-id=3 attrs= + store-id=4 attrs= + +store-load-msg + store-id=1 node-id=1 load=[100,20000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s + store-id=2 node-id=2 load=[100,20000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s + store-id=3 node-id=3 load=[50,80000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s + store-id=4 node-id=3 load=[50,10000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s +---- + +store-leaseholder-msg +store-id=1 + range-id=1 load=[10,10000000,0] raft-cpu=10 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL + store-id=3 replica-id=3 type=VOTER_FULL + config=num_replicas=3 constraints={} voter_constraints={} +---- + +# First call establishes s3's overload state and enters grace period. 
+rebalance-stores store-id=1 +---- +[mmaid=1] rebalanceStores begins +[mmaid=1] cluster means: (stores-load [cpu:75ns/s, write-bandwidth:32 MB/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:100 MB/s, byte-size:1.0 kB]) (nodes-cpu-load 100) (nodes-cpu-capacity 1333) +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] evaluating s1: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] overload-continued s1 ((store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s1 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] evaluating s2: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] overload-continued s2 ((store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s2 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=32500000 fractionUsed=80.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] evaluating s3: node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=1] overload-continued s3 ((store=overloadUrgent worst=WriteBandwidth cpu=loadLow writes=overloadUrgent bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s3 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for 
dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=32500000 fractionUsed=10.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] evaluating s4: node load loadNormal, store load loadNormal, worst dim ByteSize +[mmaid=1] start processing shedding store s1: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] local store s1 is CPU overloaded (overloadSlow >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=83 fractionUsed=5.00% meanUtil=8.33% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=40000000 fractionUsed=80.00% meanUtil=40.00% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] candidate store 2 was discarded due to (nls=false overloadDim=true pending_thresh=false): sls=(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] discarding candidates with higher load than loadThreshold(overloadSlow): s3(SLS:overloadUrgent, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate +[mmaid=1] sortTargetCandidateSetAndPick: no candidates due to load +[mmaid=1] result(failed): no candidates to move lease from n1s1 for 
r1 after sortTargetCandidateSetAndPick +[mmaid=1] attempting to shed replicas next +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] considering replica-transfer r1 from s1: store load [cpu:100ns/s, write-bandwidth:20 MB/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] result(failed): no candidates found for r1 after exclusions +[mmaid=1] start processing shedding store s2: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s2 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] skipping remote store s2: in lease shedding grace period +[mmaid=1] start processing shedding store s3: cpu node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=1] top-K[WriteBandwidth] ranges for s3 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] attempting to shed replicas next +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=32500000 fractionUsed=80.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=32500000 fractionUsed=10.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] considering replica-transfer r1 from s3: store load [cpu:50ns/s, write-bandwidth:80 MB/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s4: (store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:WriteBandwidth, picked s4 +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=61 meanLoad=75 fractionUsed=6.10% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for 
dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=21000000 meanLoad=32500000 fractionUsed=21.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): overloadSlow, reason: load is >10% above mean [load=111 meanLoad=100 fractionUsed=5.55% meanUtil=7.50% capacity=2000] +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=40 meanLoad=75 fractionUsed=4.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadSlow, reason: fractionUsed < 75% and >1.75x meanUtil [load=70000000 meanLoad=32500000 fractionUsed=70.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=90 meanLoad=100 fractionUsed=4.50% meanUtil=7.50% capacity=2000] +[mmaid=1] cannot add load to n3s4: due to target_summary(overloadSlow)>=loadNoChange,target-node(overloadSlow)>target-store(loadNormal) +[mmaid=1] [target_sls:(store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] +[mmaid=1] result(failed): cannot shed from s3 to s4 for r1: delta load [cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] rebalancing pass failures (store,reason:count): (s1,no-cand-load:1), (s3,no-cand-to-accept-load:1) +pending(0) + +# Advance time beyond the lease shedding grace period. +tick seconds=300 +---- +t=5m0s + +# s3 is overloaded. Since the lease is on s1 (not s3), s3 can only shed via +# replica transfer. s4 is on the same node (n3) and is a valid lateral target. +# TODO(during review): s4 actually gets rejected on the last mile because for +# some reason the "due to target_summary(overloadSlow)>=loadNoChange" check fires; +# I need to piece this together but this doesn't seem right. We're on a low-CPU +# node and simply trying to move IO between two stores of the same node. This +# seems like an unambiguously good idea (if we can't move the load elsewhere).
+rebalance-stores store-id=1 +---- +[mmaid=2] rebalanceStores begins +[mmaid=2] cluster means: (stores-load [cpu:75ns/s, write-bandwidth:32 MB/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:100 MB/s, byte-size:1.0 kB]) (nodes-cpu-load 100) (nodes-cpu-capacity 1333) +[mmaid=2] evaluating s1: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] store s1 was added to shedding store list +[mmaid=2] evaluating s2: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] store s2 was added to shedding store list +[mmaid=2] evaluating s3: node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=2] store s3 was added to shedding store list +[mmaid=2] evaluating s4: node load loadNormal, store load loadNormal, worst dim ByteSize +[mmaid=2] start processing shedding store s1: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] local store s1 is CPU overloaded (overloadSlow >= overloadSlow), attempting lease transfers first +[mmaid=2] considering lease-transfer r1 from s1: candidates are [2 3] +[mmaid=2] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=83 fractionUsed=5.00% meanUtil=8.33% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=40000000 fractionUsed=80.00% meanUtil=40.00% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=2] candidate store 2 was discarded due to (nls=false overloadDim=true pending_thresh=false): sls=(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=2] discarding candidates with higher load than 
loadThreshold(overloadSlow): s3(SLS:overloadUrgent, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate +[mmaid=2] sortTargetCandidateSetAndPick: no candidates due to load +[mmaid=2] result(failed): no candidates to move lease from n1s1 for r1 after sortTargetCandidateSetAndPick +[mmaid=2] attempting to shed replicas next +[mmaid=2] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] considering replica-transfer r1 from s1: store load [cpu:100ns/s, write-bandwidth:20 MB/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] result(failed): no candidates found for r1 after exclusions +[mmaid=2] start processing shedding store s2: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] top-K[CPURate] ranges for s2 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] attempting to shed replicas next +[mmaid=2] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] considering replica-transfer r1 from s2: store load [cpu:100ns/s, write-bandwidth:20 MB/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] result(failed): no candidates found for r1 after exclusions +[mmaid=2] start processing shedding store s3: cpu node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=2] top-K[WriteBandwidth] ranges for s3 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] attempting to shed replicas next +[mmaid=2] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=32500000 fractionUsed=80.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth 
(s4): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=32500000 fractionUsed=10.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=2] considering replica-transfer r1 from s3: store load [cpu:50ns/s, write-bandwidth:80 MB/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:WriteBandwidth, picked s4 +[mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=61 meanLoad=75 fractionUsed=6.10% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=21000000 meanLoad=32500000 fractionUsed=21.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): overloadSlow, reason: load is >10% above mean [load=111 meanLoad=100 fractionUsed=5.55% meanUtil=7.50% capacity=2000] +[mmaid=2] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=40 meanLoad=75 fractionUsed=4.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s3): overloadSlow, reason: fractionUsed < 75% and >1.75x meanUtil [load=70000000 meanLoad=32500000 fractionUsed=70.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=90 meanLoad=100 fractionUsed=4.50% meanUtil=7.50% capacity=2000] +[mmaid=2] cannot add load to n3s4: due to target_summary(overloadSlow)>=loadNoChange,target-node(overloadSlow)>target-store(loadNormal) +[mmaid=2] [target_sls:(store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] +[mmaid=2] result(failed): cannot shed from s3 to s4 for r1: delta load [cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] rebalancing pass failures (store,reason:count): (s1,no-cand-load:1), (s2,no-cand:1), (s3,no-cand-to-accept-load:1) +pending(0) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt new file mode 100644 index 000000000000..863a3b958a7d --- /dev/null +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt @@ -0,0 +1,106 @@ +# This test verifies that the multi-metric allocator skips stores that aren't +# ready to receive replicas when considering replica 
transfers. +# +# Setup: s1 holds the lease for r1, with replicas on s1, s2, s3. +# - s1: overloaded (wants to shed load) +# - s2, s3: also overloaded (not good targets) +# - s4: low load (would be ideal target) but marked as refusing replicas +# +# Expected: s4 is filtered out due to replica disposition. +# With no valid targets remaining, no replica transfer occurs. + +set-store + store-id=1 node-id=1 + store-id=2 node-id=2 + store-id=3 node-id=3 + store-id=4 node-id=4 +---- +node-id=1 locality-tiers=node=1 + store-id=1 attrs= +node-id=2 locality-tiers=node=2 + store-id=2 attrs= +node-id=3 locality-tiers=node=3 + store-id=3 attrs= +node-id=4 locality-tiers=node=4 + store-id=4 attrs= + +store-load-msg + store-id=1 node-id=1 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s + store-id=2 node-id=2 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s + store-id=3 node-id=3 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s + store-id=4 node-id=4 load=[100,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s +---- + +store-leaseholder-msg +store-id=1 + range-id=1 load=[100,0,0] raft-cpu=100 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL + store-id=3 replica-id=3 type=VOTER_FULL + config=num_replicas=3 constraints={} voter_constraints={} +---- + +# Mark s4 as refusing replicas. +set-store-status store-id=4 replicas=refusing +---- +ok refusing=replicas + +# s1 tries to shed load. Lease transfers fail (all other replica-holding stores +# are also overloaded). Replica transfer attempted but s4 is filtered out. +rebalance-stores store-id=1 +---- +[mmaid=1] rebalanceStores begins +[mmaid=1] cluster means: (stores-load [cpu:775ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:1.0 kB/s, byte-size:1.0 kB]) (nodes-cpu-load 775) (nodes-cpu-capacity 1000) +[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s1: node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] overload-continued s1 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s1 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s2): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n2): overloadUrgent, reason: fractionUsed > 90% [load=1000 
meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s2: node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] overload-continued s2 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s2 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s3: node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] overload-continued s3 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s3 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=100 meanLoad=775 fractionUsed=10.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=100 meanLoad=775 fractionUsed=10.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s4: node load loadLow, store load loadNormal, worst dim WriteBandwidth +[mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] +[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates +[mmaid=1] attempting to shed replicas next +[mmaid=1] skipping s4 for replica transfer: replica disposition refusing (health ok) +[mmaid=1] pre-means filtered 1 stores → remaining 
[1 2 3], means: store={[1000 0 0] [1000 1000 1000] [1 0 0] [0 0]} node={1000 1000 1} +[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] considering replica-transfer r1 from s1: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] result(failed): no candidates found for r1 after exclusions +[mmaid=1] start processing shedding store s2: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s2 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] skipping remote store s2: in lease shedding grace period +[mmaid=1] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s3 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] skipping remote store s3: in lease shedding grace period +[mmaid=1] rebalancing pass failures (store,reason:count): (s1,not-overloaded:1) +pending(0) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt index ce917a8fcafd..98cd2faf0510 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt @@ -19,7 +19,6 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=2] top-K[CPURate] ranges for s3 with lease on local s1: r3:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=2] attempting to shed replicas next -[mmaid=2] excluding all stores on n3 due to overload/fd status [mmaid=2] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=525 fractionUsed=100.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -29,6 +28,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean 
[load=100 meanLoad=525 fractionUsed=10.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r3 from s3: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -49,6 +50,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r2 from s3: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=1.10,0.00(false)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=320 meanLoad=525 fractionUsed=32.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -69,6 +72,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=320 meanLoad=525 fractionUsed=32.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r1 from s3: store load [cpu:800ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=2.20,0.00(false)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=430 meanLoad=525 fractionUsed=43.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/replica_disposition.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/replica_disposition.txt new file mode 100644 index 000000000000..0575bcd9ce58 --- /dev/null +++ 
@@ -0,0 +1,181 @@
+# Test retainReadyReplicaTargetStoresOnly, which filters stores based on:
+# 1. Health (only indirectly: unhealthy stores never report an OK disposition)
+# 2. Store-level replica disposition (must be ReplicaDispositionOK)
+# 3. High disk space utilization (>90%)
+#
+# Note: constraint-based exclusions (e.g., to avoid multiple replicas per node)
+# are handled post-means at the caller level, not here.
+
+set-store
+ store-id=1 node-id=1 locality-tiers=region=us
+ store-id=2 node-id=2 locality-tiers=region=us
+ store-id=3 node-id=3 locality-tiers=region=us
+----
+node-id=1 locality-tiers=region=us,node=1
+ store-id=1 attrs=
+node-id=2 locality-tiers=region=us,node=2
+ store-id=2 attrs=
+node-id=3 locality-tiers=region=us,node=3
+ store-id=3 attrs=
+
+# Set up stores with normal disk usage (50% used).
+store-load-msg
+ store-id=1 node-id=1 load=[100,0,50] capacity=[200,100,100] load-time=0s
+ store-id=2 node-id=2 load=[50,0,50] capacity=[200,100,100] load-time=0s
+ store-id=3 node-id=3 load=[50,0,50] capacity=[200,100,100] load-time=0s
+----
+
+store-leaseholder-msg
+store-id=1
+ range-id=1 load=[10,0,0]
+ config=(num_replicas=3)
+ store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true
+ store-id=2 replica-id=2 type=VOTER_FULL leaseholder=false
+ store-id=3 replica-id=3 type=VOTER_FULL leaseholder=false
+----
+
+# All stores healthy and accepting replicas - all retained.
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+[1 2 3]
+
+# Only stores in the input set are considered.
+retain-ready-replica-target-stores-only in=(1,3)
+----
+[1 3]
+
+# Mark s2 as shedding leases, which should have no effect
+# since we're looking at replicas.
+set-store-status store-id=2 leases=shedding
+----
+ok shedding=leases
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+[1 2 3]
+
+# Make store 2 unhealthy - but the filter relies only on the
+# disposition, so no filtering occurs.
+# This can't happen in production since unhealthy
+# stores never report an OK disposition.
+set-store-status store-id=2 health=unhealthy leases=ok
+----
+unhealthy accepting all
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+[1 2 3]
+
+# Different kind of unhealthy. Same result.
+set-store-status store-id=2 health=unknown
+----
+unknown accepting all
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+[1 2 3]
+
+# Restore store 2, make store 3 refuse replicas at store level.
+set-store-status store-id=2 health=ok
+----
+ok accepting all
+
+set-store-status store-id=3 replicas=refusing
+----
+ok refusing=replicas
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+skipping s3 for replica transfer: replica disposition refusing (health ok)
+[1 2]
+
+# Shedding and refusing are treated the same.
+set-store-status store-id=3 health=ok replicas=shedding
+----
+ok shedding=replicas
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+skipping s3 for replica transfer: replica disposition shedding (health ok)
+[1 2]
+
+# Restore store 3.
+set-store-status store-id=3 replicas=ok
+----
+ok accepting all
+
+# All stores ready again.
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+[1 2 3]
+
+# Test high disk utilization filtering.
+# Set store 2 to >90% disk usage.
+store-load-msg
+ store-id=2 node-id=2 load=[50,0,95] capacity=[200,100,100] load-time=0s
+----
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+skipping s2 for replica transfer: high disk utilization (health ok)
+[1 3]
+
+# Test combination of filters: s2 still high disk,
+# s3 refusing replicas.
+set-store-status store-id=3 replicas=refusing
+----
+ok refusing=replicas
+
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+skipping s2 for replica transfer: high disk utilization (health ok)
+skipping s3 for replica transfer: replica disposition refusing (health ok)
+[1]
+
+# Test the `replicas` parameter: stores in this set bypass the disposition
+# check since they already hold a replica (their load should be in the mean
+# regardless of whether they are accepting new replicas).
+
+# Reset stores to healthy.
+set-store-status store-id=2 health=ok
+----
+ok accepting all
+
+# Reset high disk on s2.
+store-load-msg
+ store-id=2 node-id=2 load=[50,0,0] capacity=[200,100,100] load-time=0s
+----
+
+set-store-status store-id=3 replicas=ok
+----
+ok accepting all
+
+# Make store 1 refuse replicas.
+set-store-status store-id=1 replicas=refusing
+----
+ok refusing=replicas
+
+# When s1 is not passed in via `replicas`, it is filtered out as usual.
+retain-ready-replica-target-stores-only in=(1,2,3)
+----
+skipping s1 for replica transfer: replica disposition refusing (health ok)
+[2 3]
+
+# With an existing replica on s1, s1 bypasses the disposition check and is retained.
+# (It already has a replica, so its disposition is irrelevant for mean computation.)
+retain-ready-replica-target-stores-only in=(1,2,3) replicas=(1)
+----
+[1 2 3]
+
+# Even if s1 is ill-disposed, it's not filtered out when it's the shedding store.
+#
+# TODO(tbg): this is an interesting case that I need to understand
+# better, especially if we also consider excluding ALL current replicas
+# from the check.
+set-store-status store-id=1 health=unhealthy replicas=refusing
+----
+unhealthy refusing=replicas
+
+retain-ready-replica-target-stores-only in=(1,2,3) replicas=(1)
+----
+[1 2 3]
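
For orientation, a minimal Go sketch of the filtering behavior that replica_disposition.txt exercises follows. The function name mirrors the datadriven command, but the types, field names, and log strings are simplified assumptions rather than the actual mmaprototype API. Whether stores passed via replicas= also bypass the disk-utilization check is not exercised by the testdata above, so the sketch applies that check to all stores.

// Hypothetical sketch of the filtering rules demonstrated by the testdata.
package main

import "fmt"

// storeStatus is an assumed, flattened view of the per-store state consulted
// by the filter in this sketch.
type storeStatus struct {
	replicaDispositionOK     bool // store-level "accepting replicas" signal
	highDiskSpaceUtilization bool // disk usage above the ~90% threshold
}

// retainReadyReplicaTargetStoresOnlySketch drops stores whose replica
// disposition is not OK or whose disk utilization is high. Stores that
// already hold a replica bypass the disposition check, since their load must
// stay in the mean computation. Health is not consulted directly; per the
// test comments, an unhealthy store never reports an OK disposition in
// production.
func retainReadyReplicaTargetStoresOnlySketch(
	in []int, status map[int]storeStatus, existingReplicas map[int]bool,
) []int {
	out := make([]int, 0, len(in))
	for _, storeID := range in {
		st := status[storeID]
		if !existingReplicas[storeID] && !st.replicaDispositionOK {
			fmt.Printf("skipping s%d for replica transfer: replica disposition not ok\n", storeID)
			continue
		}
		if st.highDiskSpaceUtilization {
			fmt.Printf("skipping s%d for replica transfer: high disk utilization\n", storeID)
			continue
		}
		out = append(out, storeID)
	}
	return out
}

func main() {
	status := map[int]storeStatus{
		1: {replicaDispositionOK: true},
		2: {replicaDispositionOK: true},
		3: {replicaDispositionOK: false}, // e.g. refusing or shedding replicas
	}
	// Mirrors "retain-ready-replica-target-stores-only in=(1,2,3)" with s3
	// refusing: prints the skip line for s3, then [1 2].
	fmt.Println(retainReadyReplicaTargetStoresOnlySketch([]int{1, 2, 3}, status, nil))
}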