diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 2aac1a1ef7..f420a61241 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -17,6 +17,18 @@ go_template_instance( }, ) +go_template_instance( + name = "atomicptrmap_tid_vcpu", + out = "atomicptrmap_tid_vcpu_unsafe.go", + package = "kvm", + prefix = "vCPUsByTID", + template = "//pkg/sync/atomicptrmap:generic_atomicptrmap", + types = { + "Key": "uint64", + "Value": "vCPU", + }, +) + config_setting( name = "debug_build", values = { @@ -31,6 +43,7 @@ go_library( "address_space_amd64.go", "address_space_arm64.go", "atomicptr_machine_unsafe.go", + "atomicptrmap_tid_vcpu_unsafe.go", "bluepill.go", "bluepill_allocator.go", "bluepill_amd64.go", @@ -76,6 +89,7 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/fd", + "//pkg/gohacks", "//pkg/hostarch", "//pkg/hostos", "//pkg/hostsyscall", @@ -130,6 +144,7 @@ go_test( "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/time", + "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -164,6 +179,7 @@ go_test( "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/time", + "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index fe161acdd0..927d2d923a 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -70,14 +70,18 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { // Copy out registers. bluepillArchExit(c, bluepillArchContext(context)) - // Return to the vCPUReady state; notify any waiters. - user := c.state.Load() & vCPUUser - switch c.state.Swap(user) { - case user | vCPUGuest: // Expected case. - case user | vCPUGuest | vCPUWaiter: + // Unset vCPUGuest and vCPUWaiter; notify any waiters. + oldState := c.state.Load() + if oldState&vCPUGuest == 0 { + throw("invalid state in kvm.bluepillGuestExit") + } + newState := oldState &^ (vCPUGuest | vCPUWaiter) + oldState = c.state.Swap(newState) + if newState&vCPUStateMask == vCPUReady { + c.machine.availableNotify() + } + if oldState&vCPUWaiter != 0 { c.notify() - default: - throw("invalid state") } } @@ -116,13 +120,14 @@ func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) - // Mark this as guest mode. - switch c.state.Swap(vCPUGuest | vCPUUser) { - case vCPUUser: // Expected case. - case vCPUUser | vCPUWaiter: + // Set vCPUGuest; unset vCPUWaiter; notify waiters. + state := c.state.Load() + if state&(vCPUUser|vCPUGuest) != vCPUUser { + throw("invalid state in bluepillHandler") + } + state = c.state.Swap((state | vCPUGuest) &^ vCPUWaiter) + if state&vCPUWaiter != 0 { c.notify() - default: - throw("invalid state") } for { diff --git a/pkg/sentry/platform/kvm/config.go b/pkg/sentry/platform/kvm/config.go index 8cf8fa5c87..76d28876db 100644 --- a/pkg/sentry/platform/kvm/config.go +++ b/pkg/sentry/platform/kvm/config.go @@ -18,6 +18,16 @@ package kvm // Config sets configuration options for each platform instance. -type Config struct{} +type Config struct { + // MaxVCPUs is the maximum number of vCPUs the platform instance will + // create. If MaxVCPUs is 0, the platform will choose a reasonable default. 
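+ // MaxVCPUs must not be negative. Values above the per-VM limit reported
+ // by KVM_CAP_MAX_VCPUS are capped to that limit.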
+ MaxVCPUs int +} -func (*machine) applyConfig(config *Config) error { return nil } +func (m *machine) applyConfig(config *Config) error { + if config.MaxVCPUs < 0 { + return fmt.Errorf("invalid Config.MaxVCPUs: %d", config.MaxVCPUs) + } + m.maxVCPUs = config.MaxVCPUs + return nil +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 69070b3e88..c43e88dbb4 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -16,6 +16,7 @@ package kvm import ( + "encoding/binary" "fmt" "golang.org/x/sys/unix" @@ -76,6 +77,11 @@ type KVM struct { var ( globalOnce sync.Once globalErr error + + // vCPUStateOffset is the offset in bytes from the start of vCPU.state to + // the start of the lower 4 bytes of vCPU.state, which varies depending on + // endianness. + vCPUStateOffset uintptr ) // OpenDevice opens the KVM device and returns the File. @@ -97,6 +103,18 @@ func New(deviceFile *fd.FD, config Config) (*KVM, error) { // Ensure global initialization is done. globalOnce.Do(func() { + var buf [8]byte + binary.NativeEndian.PutUint64(buf[:], 0x1122334455667788) + switch binary.NativeEndian.Uint32(buf[0:4]) { + case 0x11223344: // big endian + vCPUStateOffset = 4 + case 0x55667788: // little endian + vCPUStateOffset = 0 + default: + globalErr = fmt.Errorf("failed to determine endianness for kvm.vCPUStateOffset") + return + } + globalErr = updateGlobalOnce(int(fd)) }) if globalErr != nil { diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 48a501c016..dd17d0d3ba 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -18,6 +18,7 @@ import ( "math/rand" "os" "reflect" + "runtime" "testing" "time" @@ -32,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" ktime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sync" ) // dummyFPState is initialized in TestMain. @@ -90,7 +92,7 @@ func bluepillTest(t testHarness, fn func(*vCPU)) { func TestKernelSyscall(t *testing.T) { bluepillTest(t, func(c *vCPU) { redpill() // Leave guest mode. - if got := c.state.Load(); got != vCPUUser { + if got := c.state.Load(); got&vCPUStateMask != vCPUUser { t.Errorf("vCPU not in ready state: got %v", got) } }) @@ -108,7 +110,7 @@ func TestKernelFault(t *testing.T) { hostFault() // Ensure recovery works. bluepillTest(t, func(c *vCPU) { hostFault() - if got := c.state.Load(); got != vCPUUser { + if got := c.state.Load(); got&vCPUStateMask != vCPUUser { t.Errorf("vCPU not in ready state: got %v", got) } }) @@ -310,6 +312,12 @@ func TestBounceStress(t *testing.T) { time.Sleep(time.Duration(n) * time.Microsecond) } } + m := c.machine + // Lock to this thread so that m.Get() always gives us the same vCPU, + // since applicationTest() => kvmTest() assumes that we return holding + // c. + runtime.LockOSThread() + defer runtime.UnlockOSThread() for i := 0; i < 1000; i++ { // Start an asynchronously executing goroutine that // calls Bounce at pseudo-random point in time. @@ -328,9 +336,11 @@ func TestBounceStress(t *testing.T) { }, &si); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } - c.unlock() + m.Put(c) randomSleep() - c.lock() + if c2 := m.Get(); c != c2 { + t.Fatalf("machine.Get() changed vCPUs from %p to %p", c, c2) + } } return false }) @@ -480,6 +490,63 @@ func TestKernelVDSO(t *testing.T) { }) } +// Regression test for b/404271139. 
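+//
+// With Config.MaxVCPUs set to 1, two goroutines repeatedly hand the single
+// vCPU back and forth through machine.Get and machine.Put; the test hangs if
+// a waiter in Get is never notified that the vCPU has become ready again.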
+func TestSingleVCPU(t *testing.T) { + // Create the machine. + deviceFile, err := OpenDevice("") + if err != nil { + t.Fatalf("error opening device file: %v", err) + } + k, err := New(deviceFile, Config{ + MaxVCPUs: 1, + }) + if err != nil { + t.Fatalf("error creating KVM instance: %v", err) + } + defer k.machine.Destroy() + + // Ping-pong the single vCPU between two goroutines. The test passes if + // this does not deadlock. + stopC := make(chan struct{}) + var doneWG sync.WaitGroup + defer func() { + close(stopC) + doneWG.Wait() + }() + var wakeC [2]chan struct{} + for i := range wakeC { + wakeC[i] = make(chan struct{}, 1) + } + for i := range wakeC { + doneWG.Add(1) + go func(i int) { + defer doneWG.Done() + for { + // Multiple ready channels in a select statement are chosen + // from randomly, so have a separate non-blocking receive from + // stopC first to ensure that it's honored in deterministic + // time. + select { + case <-stopC: + return + default: + } + select { + case <-stopC: + return + case <-wakeC[i]: + c := k.machine.Get() + bluepill(c) + wakeC[1-i] <- struct{}{} + k.machine.Put(c) + } + } + }(i) + } + wakeC[0] <- struct{}{} + time.Sleep(time.Second) +} + func BenchmarkApplicationSyscall(b *testing.B) { var ( i int // Iteration includes machine.Get() / machine.Put(). diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index a89dea1706..ad8aae5c76 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -57,23 +57,24 @@ type machine struct { // kernel is the set of global structures. kernel ring0.Kernel - // mu protects vCPUs. - mu sync.RWMutex - - // available is notified when vCPUs are available. - available sync.Cond - // vCPUsByTID are the machine vCPUs. // // These are populated dynamically. - vCPUsByTID map[uint64]*vCPU + vCPUsByTID vCPUsByTIDAtomicPtrMap + + // availableWaiters is the number of goroutines waiting for a vCPU to + // become ready. + availableWaiters atomicbitops.Int32 + + // availableSeq is incremented whenever a vCPU becomes ready. + availableSeq atomicbitops.Uint32 // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID. vCPUsByID []*vCPU // usedVCPUs is the number of vCPUs that have been used from the // vCPUsByID pool. - usedVCPUs int + usedVCPUs atomicbitops.Int32 // maxVCPUs is the maximum number of vCPUs supported by the machine. maxVCPUs int @@ -88,20 +89,25 @@ type machine struct { usedSlots []uintptr } +// Bits in vCPU.state: const ( // vCPUReady is an alias for all the below clear. - vCPUReady uint32 = 0 + vCPUReady = 0 // vCPUser indicates that the vCPU is in or about to enter user mode. - vCPUUser uint32 = 1 << 0 + vCPUUser = 1 << 0 // vCPUGuest indicates the vCPU is in guest mode. - vCPUGuest uint32 = 1 << 1 + vCPUGuest = 1 << 1 // vCPUWaiter indicates that there is a waiter. // // If this is set, then notify must be called on any state transitions. - vCPUWaiter uint32 = 1 << 2 + vCPUWaiter = 1 << 2 + + vCPUTIDShift = 32 + vCPUStateMask = (uint64(1) << vCPUTIDShift) - 1 + vCPUTIDMask = ^vCPUStateMask ) // Field values for the get_vcpu metric acquisition path used. @@ -181,9 +187,6 @@ type vCPU struct { // fd is the vCPU fd. fd int - // tid is the last set tid. - tid atomicbitops.Uint64 - // userExits is the count of user exits. userExits atomicbitops.Uint64 @@ -193,10 +196,10 @@ type vCPU struct { // faults is a count of world faults (informational only). faults uint32 - // state is the vCPU state. 
- // - // This is a bitmask of the three fields (vCPU*) described above. - state atomicbitops.Uint32 + // The bottom 32 bits of state is a bitset of state fields; see vCPUUser + // and company above. If vCPUGuest or vCPUUser are set, then the top 32 + // bits of state are the thread ID of the thread that has set them. + state atomicbitops.Uint64 // runData for this vCPU. runData *runData @@ -270,7 +273,6 @@ var forceMappingEntireAddressSpace = false func newMachine(vm int, config *Config) (*machine, error) { // Create the machine. m := &machine{fd: vm} - m.available.L = &m.mu if err := m.applyConfig(config); err != nil { panic(fmt.Sprintf("error setting config parameters: %s", err)) @@ -279,7 +281,6 @@ func newMachine(vm int, config *Config) (*machine, error) { // Pull the maximum vCPUs. m.getMaxVCPU() log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) - m.vCPUsByTID = make(map[uint64]*vCPU) m.vCPUsByID = make([]*vCPU, m.maxVCPUs) m.kernel.Init(m.maxVCPUs) @@ -510,109 +511,118 @@ func (m *machine) Destroy() { // the current context in guest, the vCPU of it must be the same as what // Get() returns. func (m *machine) Get() *vCPU { - m.mu.RLock() runtime.LockOSThread() tid := hosttid.Current() // Check for an exact match. - if c := m.vCPUsByTID[tid]; c != nil { - c.lock() - m.mu.RUnlock() - getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) - return c + if c := m.vCPUsByTID.Load(tid); c != nil { + // If this succeeds, we can ignore vCPUGuest since it just indicates + // that we're already in guest mode. + if state := c.state.Load(); state>>vCPUTIDShift == tid && c.state.CompareAndSwap(state, state|vCPUUser) { + getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) + return c + } } - // The happy path failed. We now proceed to acquire an exclusive lock - // (because the vCPU map may change), and scan all available vCPUs. - // In this case, we first unlock the OS thread. Otherwise, if mu is - // not available, the current system thread will be parked and a new - // system thread spawned. We avoid this situation by simply refreshing - // tid after relocking the system thread. - m.mu.RUnlock() - runtime.UnlockOSThread() - m.mu.Lock() - + // Get vCPU from the m.vCPUsByID pool. + owned := (tid << vCPUTIDShift) | vCPUUser for { - runtime.LockOSThread() - tid = hosttid.Current() - - // Recheck for an exact match. - if c := m.vCPUsByTID[tid]; c != nil { - c.lock() - m.mu.Unlock() - getVCPUCounter.Increment(&getVCPUAcquisitionReused) - return c + usedVCPUs := m.usedVCPUs.Load() + if int(usedVCPUs) >= m.maxVCPUs { + break } + // XXX switch to Add + if !m.usedVCPUs.CompareAndSwap(usedVCPUs, usedVCPUs+1) { + continue + } + c := m.vCPUsByID[usedVCPUs] + c.state.Store(owned) + m.vCPUsByTID.Store(tid, c) + c.loadSegments() + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) + return c + } - // Get vCPU from the m.vCPUsByID pool. - if m.usedVCPUs < m.maxVCPUs { - c := m.vCPUsByID[m.usedVCPUs] - m.usedVCPUs++ - c.lock() - m.vCPUsByTID[tid] = c - m.mu.Unlock() - c.loadSegments(tid) + // Scan for an available vCPU. + for origTID, c := range m.vCPUsByTID.RangeRepeatable { + // We can't steal vCPUs from other threads that are in guest mode. + if state := c.state.Load(); state&vCPUStateMask == vCPUReady && c.state.CompareAndSwap(state, owned) { + m.vCPUsByTID.CompareAndSwap(origTID, c, nil) + m.vCPUsByTID.Store(tid, c) + c.loadSegments() getVCPUCounter.Increment(&getVCPUAcquisitionUnused) return c } + } - // Scan for an available vCPU. 
- for origTID, c := range m.vCPUsByTID { - if c.state.CompareAndSwap(vCPUReady, vCPUUser) { - delete(m.vCPUsByTID, origTID) - m.vCPUsByTID[tid] = c - m.mu.Unlock() - c.loadSegments(tid) + // Wait for an available vCPU. + m.availableWaiters.Add(1) + for { + epoch := m.availableSeq.Load() + for origTID, c := range m.vCPUsByTID.RangeRepeatable { + if state := c.state.Load(); state&vCPUStateMask == vCPUReady && c.state.CompareAndSwap(state, owned) { + m.vCPUsByTID.CompareAndSwap(origTID, c, nil) + m.vCPUsByTID.Store(tid, c) + m.availableWaiters.Add(-1) + c.loadSegments() getVCPUCounter.Increment(&getVCPUAcquisitionUnused) return c } } - // Scan for something not in user mode. - for origTID, c := range m.vCPUsByTID { - if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { - continue - } + // All vCPUs are already in guest mode. Wait until a vCPU becomes + // available. m.availableWait() blocks in the host, but unlocking the + // OS thread still makes waking up less expensive if sysmon steals our + // P while we're blocked. + runtime.UnlockOSThread() + m.availableWait(epoch) + runtime.LockOSThread() + tid = hosttid.Current() + owned = (tid << vCPUTIDShift) | vCPUUser - // The vCPU is not be able to transition to - // vCPUGuest|vCPUWaiter or to vCPUUser because that - // transition requires holding the machine mutex, as we - // do now. There is no path to register a waiter on - // just the vCPUReady state. - for { - c.waitUntilNot(vCPUGuest | vCPUWaiter) - if c.state.CompareAndSwap(vCPUReady, vCPUUser) { - break - } + // Recheck for an exact match. + if c := m.vCPUsByTID.Load(tid); c != nil { + if state := c.state.Load(); state>>vCPUTIDShift == tid && c.state.CompareAndSwap(state, state|vCPUUser) { + m.availableWaiters.Add(-1) + getVCPUCounter.Increment(&getVCPUAcquisitionReused) + return c } - - // Steal the vCPU. - delete(m.vCPUsByTID, origTID) - m.vCPUsByTID[tid] = c - m.mu.Unlock() - c.loadSegments(tid) - getVCPUCounter.Increment(&getVCPUAcquisitionStolen) - return c } - - // Everything is executing in user mode. Wait until something is - // available. As with m.mu.Lock() above, unlock the OS thread while we - // do this to avoid spawning additional system threads. Note that - // signaling the condition variable will have the extra effect of - // kicking the vCPUs out of guest mode if that's where they were. - runtime.UnlockOSThread() - m.available.Wait() } } // Put puts the current vCPU. func (m *machine) Put(c *vCPU) { - c.unlock() - runtime.UnlockOSThread() + // Fast path: + if c.stateLower().CompareAndSwap(vCPUUser|vCPUGuest, vCPUGuest) { + runtime.UnlockOSThread() + return + } - m.mu.RLock() - m.available.Signal() - m.mu.RUnlock() + var ( + oldState uint64 + newState uint64 + ) + // Unset vCPUUser and vCPUWaiter; notify waiters. This needs to CAS since + // we might leave guest mode between c.state.Load() and + // c.state.CompareAndSwap(). + for { + oldState = c.state.Load() + if oldState&vCPUUser == 0 { + panic("putting vCPU not locked by Get") + } + newState = oldState &^ (vCPUUser | vCPUWaiter) + if c.state.CompareAndSwap(oldState, newState) { + break + } + } + runtime.UnlockOSThread() + if newState&vCPUStateMask == vCPUReady { + m.availableNotify() + } + if oldState&vCPUWaiter != 0 { + c.notify() + } } // newDirtySet returns a new dirty set. @@ -625,9 +635,6 @@ func (m *machine) newDirtySet() *dirtySet { // dropPageTables drops cached page table entries. 
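+//
+// No locking is required: vCPUsByID is fully populated during initialization
+// and is only read afterwards.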
func (m *machine) dropPageTables(pt *pagetables.PageTables) { - m.mu.Lock() - defer m.mu.Unlock() - // Clear from all PCIDs. for _, c := range m.vCPUsByID { if c != nil && c.PCIDs != nil { @@ -636,57 +643,6 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) { } } -// lock marks the vCPU as in user mode. -// -// This should only be called directly when known to be safe, i.e. when -// the vCPU is owned by the current TID with no chance of theft. -// -//go:nosplit -func (c *vCPU) lock() { - atomicbitops.OrUint32(&c.state, vCPUUser) -} - -// unlock clears the vCPUUser bit. -// -//go:nosplit -func (c *vCPU) unlock() { - origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) - if origState == vCPUUser|vCPUGuest { - // Happy path: no exits are forced, and we can continue - // executing on our merry way with a single atomic access. - return - } - - // Clear the lock. - for { - state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) - if state == origState { - break - } - origState = state - } - switch origState { - case vCPUUser: - // Normal state. - case vCPUUser | vCPUGuest | vCPUWaiter: - // Force a transition: this must trigger a notification when we - // return from guest mode. We must clear vCPUWaiter here - // anyways, because BounceToKernel will force a transition only - // from ring3 to ring0, which will not clear this bit. Halt may - // workaround the issue, but if there is no exception or - // syscall in this period, BounceToKernel will hang. - atomicbitops.AndUint32(&c.state, ^vCPUWaiter) - c.notify() - case vCPUUser | vCPUWaiter: - // Waiting for the lock to be released; the responsibility is - // on us to notify the waiter and clear the associated bit. - atomicbitops.AndUint32(&c.state, ^vCPUWaiter) - c.notify() - default: - panic("invalid state") - } -} - // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. // //go:nosplit @@ -704,7 +660,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { origGuestExits := c.guestExits.Load() origUserExits := c.userExits.Load() for { - switch state := c.state.Load(); state { + switch state := c.state.Load(); state & vCPUStateMask { case vCPUReady, vCPUWaiter: // There is nothing to be done, we're already in the // kernel pre-acquisition. The Bounce criteria have @@ -721,7 +677,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { // come from the bluepill handler. c.waitUntilNot(state) case vCPUGuest, vCPUUser | vCPUGuest: - if state == vCPUGuest && !forceGuestExit { + if state&vCPUStateMask == vCPUGuest && !forceGuestExit { // The vCPU is already not acquired, so there's // no need to do a fresh injection here. return @@ -737,7 +693,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { // under memory pressure. Since we already // marked ourselves as a waiter, we need to // ensure that a signal is actually delivered. - if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { + if err := unix.Tgkill(pid, int(state>>vCPUTIDShift), bounceSignal); err == nil { break } else if err.(unix.Errno) == unix.EAGAIN { continue @@ -747,7 +703,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { } } case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: - if state == vCPUGuest|vCPUWaiter && !forceGuestExit { + if state&vCPUStateMask == vCPUGuest|vCPUWaiter && !forceGuestExit { // See above. return } @@ -756,7 +712,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { c.waitUntilNot(state) default: // Should not happen: the above is exhaustive. 
- panic("invalid state") + panic("invalid state in kvm.vCPU.bounce") } // Check if we've missed the state transition, but diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 342013d90b..acc854f1de 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -49,11 +49,9 @@ func (m *machine) initArchState() error { } // Initialize all vCPUs to minimize kvm ioctl-s allowed by seccomp filters. - m.mu.Lock() for i := 0; i < m.maxVCPUs; i++ { m.createVCPU(i) } - m.mu.Unlock() return nil } @@ -497,21 +495,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { // getMaxVCPU get max vCPU number func (m *machine) getMaxVCPU() { + if m.maxVCPUs == 0 { + // The goal here is to avoid vCPU contentions for reasonable workloads. + // But "reasonable" isn't defined well in this case. Let's say that CPU + // overcommit with factor 2 is still acceptable. We allocate a set of + // vCPU for each goruntime processor (P) and two sets of vCPUs to run + // user code. + m.maxVCPUs = 3 * runtime.GOMAXPROCS(0) + } + + // Apply KVM limit. maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) if errno != 0 { - m.maxVCPUs = _KVM_NR_VCPUS - } else { - m.maxVCPUs = int(maxVCPUs) + maxVCPUs = _KVM_NR_VCPUS } - - // The goal here is to avoid vCPU contentions for reasonable workloads. - // But "reasonable" isn't defined well in this case. Let's say that CPU - // overcommit with factor 2 is still acceptable. We allocate a set of - // vCPU for each goruntime processor (P) and two sets of vCPUs to run - // user code. - rCPUs := runtime.GOMAXPROCS(0) - if 3*rCPUs < m.maxVCPUs { - m.maxVCPUs = 3 * rCPUs + if m.maxVCPUs > int(maxVCPUs) { + m.maxVCPUs = int(maxVCPUs) } } diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 32a79939af..d17f7a26b8 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -35,7 +35,7 @@ func rdgsbase() uint64 // This may be called from within the signal context and throws on error. // //go:nosplit -func (c *vCPU) loadSegments(tid uint64) { +func (c *vCPU) loadSegments() { if errno := hostsyscall.RawSyscallErrno(unix.SYS_SIGALTSTACK, 0, uintptr(unsafe.Pointer(&c.signalStack)), 0); errno != 0 { throw("sigaltstack") } @@ -58,7 +58,6 @@ func (c *vCPU) loadSegments(tid uint64) { throw("getting GS segment") } } - c.tid.Store(tid) } // setCPUID sets the CPUID to be used by the guest. diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index bcc7fb7760..f60e8fff60 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -185,16 +185,12 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, // getMaxVCPU get max vCPU number func (m *machine) getMaxVCPU() { - rmaxVCPUs := runtime.NumCPU() - smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) - // compare the max vcpu number from runtime and syscall, use smaller one. - if errno != 0 { - m.maxVCPUs = rmaxVCPUs - } else { - if rmaxVCPUs < int(smaxVCPUs) { - m.maxVCPUs = rmaxVCPUs - } else { - m.maxVCPUs = int(smaxVCPUs) - } + if m.maxVCPUs == 0 { + m.maxVCPUs = runtime.NumCPU() + } + + // Apply KVM limit. 
+ if maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 && m.maxVCPUs > int(maxVCPUs) { + m.maxVCPUs = int(maxVCPUs) } } diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 946129da89..6727f70f51 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -53,11 +53,9 @@ func (m *machine) initArchState() error { // The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms. // If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time. // For more detail, please refer to https://github.com/google/gvisor/issues/5739 - m.mu.Lock() for i := 0; i < m.maxVCPUs; i++ { m.createVCPU(i) } - m.mu.Unlock() return nil } @@ -259,10 +257,9 @@ func (c *vCPU) setSystemTime() error { } //go:nosplit -func (c *vCPU) loadSegments(tid uint64) { +func (c *vCPU) loadSegments() { // TODO(gvisor.dev/issue/1238): TLS is not supported. // Get TLS from tpidr_el0. - c.tid.Store(tid) } func (c *vCPU) setOneRegister(reg *kvmOneReg) error { diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index cf2e36c5c7..15808b643f 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -114,6 +114,49 @@ func (a *atomicAddressSpace) get() *addressSpace { return (*addressSpace)(atomic.LoadPointer(&a.pointer)) } +// availableNotify is called when a vCPU's state transitions to vCPUReady. +// +//go:nosplit +func (m *machine) availableNotify() { + m.availableSeq.Add(1) + if m.availableWaiters.Load() == 0 { + return + } + errno := hostsyscall.RawSyscallErrno( // escapes: no. + unix.SYS_FUTEX, + uintptr(unsafe.Pointer(&m.availableSeq)), + linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, + 1) + if errno != 0 { + throw("futex wake error") + } +} + +// availableWait blocks until availableNotify is called. +// +// Preconditions: +// - epoch was the value of m.availableSeq before the caller last checked that +// no vCPUs were in state vCPUReady. +// - m.availableWaiters must be non-zero. +// +//go:nosplit +func (m *machine) availableWait(epoch uint32) { + _, _, errno := unix.Syscall6( + unix.SYS_FUTEX, + uintptr(unsafe.Pointer(&m.availableSeq)), + linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, + uintptr(epoch), + 0, 0, 0) + if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN { + panic("futex wait error") + } +} + +// stateLower returns a pointer to the lower 32 bits of c.state. +func (c *vCPU) stateLower() *atomicbitops.Uint32 { + return (*atomicbitops.Uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&c.state)) + vCPUStateOffset)) +} + // notify notifies that the vCPU has transitioned modes. // // This may be called by a signal handler and therefore throws on error. @@ -122,7 +165,7 @@ func (a *atomicAddressSpace) get() *addressSpace { func (c *vCPU) notify() { errno := hostsyscall.RawSyscallErrno( // escapes: no. unix.SYS_FUTEX, - uintptr(unsafe.Pointer(&c.state)), + uintptr(unsafe.Pointer(c.stateLower())), linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, // Number of waiters. math.MaxInt32) @@ -137,12 +180,12 @@ func (c *vCPU) notify() { // appropriate action to cause a transition (e.g. interrupt injection). // // This panics on error. 
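+//
+// Only the low 32 bits of state are used: the futex waits on the 32-bit word
+// at vCPUStateOffset within c.state.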
-func (c *vCPU) waitUntilNot(state uint32) { +func (c *vCPU) waitUntilNot(state uint64) { _, _, errno := unix.Syscall6( unix.SYS_FUTEX, - uintptr(unsafe.Pointer(&c.state)), + uintptr(unsafe.Pointer(c.stateLower())), linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, - uintptr(state), + uintptr(uint32(state)), 0, 0, 0) if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN { panic("futex wait error") diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 2769ab5a92..b9381e69e6 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -17,6 +17,7 @@ package vfs import ( "fmt" "math/bits" + "math/rand/v2" "sync/atomic" "unsafe" @@ -38,7 +39,7 @@ type mountKey struct { var ( mountKeyHasher = sync.MapKeyHasher(map[mountKey]struct{}(nil)) - mountKeySeed = sync.RandUintptr() + mountKeySeed = uintptr(rand.Uint64()) ) func (k *mountKey) hash() uintptr { diff --git a/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go b/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go index 8324d4c0fc..646ed6caff 100644 --- a/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go +++ b/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go @@ -17,6 +17,7 @@ package atomicptrmap import ( + "math/rand/v2" "sync/atomic" "unsafe" @@ -56,7 +57,7 @@ type defaultHasher struct { // Init initializes the Hasher. func (h *defaultHasher) Init() { h.fn = sync.MapKeyHasher(map[Key]*Value(nil)) - h.seed = sync.RandUintptr() + h.seed = uintptr(rand.Uint64()) } // Hash returns the hash value for the given Key. @@ -430,15 +431,18 @@ func (shard *apmShard) rehash(oldSlots unsafe.Pointer) { // // f must not call other methods on m. func (m *AtomicPtrMap) Range(f func(key Key, val *Value) bool) { + // Randomize start position for consistency with Go maps. + start := uintptr(rand.Uint64()) + for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] - if !shard.doRange(f) { + if !shard.doRange(f, start) { return } } } -func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { +func (shard *apmShard) doRange(f func(key Key, val *Value) bool, start uintptr) bool { // We have to lock rehashMu because if we handled races with rehashing by // retrying, f could see the same key twice. shard.rehashMu.Lock() @@ -448,14 +452,19 @@ func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { return true } mask := shard.mask - for i := uintptr(0); i <= mask; i++ { + start &= mask + i := start + for { slot := apmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) - if slotVal == nil || slotVal == tombstone() { - continue + if slotVal != nil && slotVal != tombstone() { + if !f(slot.key, (*Value)(slotVal)) { + return false + } } - if !f(slot.key, (*Value)(slotVal)) { - return false + i = (i + 1) & mask + if i == start { + break } } return true @@ -469,6 +478,9 @@ func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { // // - It is safe for f to call other methods on m. func (m *AtomicPtrMap) RangeRepeatable(f func(key Key, val *Value) bool) { + // Randomize start position for consistency with Go maps. 
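+ // Each shard is scanned starting at start & mask and wrapping around, so
+ // callers that stop at the first match do not always favor the same slots.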
+ start := uintptr(rand.Uint64()) + for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] @@ -483,17 +495,22 @@ func (m *AtomicPtrMap) RangeRepeatable(f func(key Key, val *Value) bool) { continue } - for i := uintptr(0); i <= mask; i++ { + startMask := start & mask + i := startMask + for { slot := apmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == evacuated() { goto retry } - if slotVal == nil || slotVal == tombstone() { - continue + if slotVal != nil && slotVal != tombstone() { + if !f(slot.key, (*Value)(slotVal)) { + return + } } - if !f(slot.key, (*Value)(slotVal)) { - return + i = (i + 1) & mask + if i == startMask { + break } } } diff --git a/pkg/sync/runtime_unsafe.go b/pkg/sync/runtime_unsafe.go index 5bc0a92e07..8c5882da74 100644 --- a/pkg/sync/runtime_unsafe.go +++ b/pkg/sync/runtime_unsafe.go @@ -73,27 +73,6 @@ func Goready(gp uintptr, traceskip int, wakep bool) { } } -// Rand32 returns a non-cryptographically-secure random uint32. -func Rand32() uint32 { - return fastrand() -} - -// Rand64 returns a non-cryptographically-secure random uint64. -func Rand64() uint64 { - return uint64(fastrand())<<32 | uint64(fastrand()) -} - -//go:linkname fastrand runtime.fastrand -func fastrand() uint32 - -// RandUintptr returns a non-cryptographically-secure random uintptr. -func RandUintptr() uintptr { - if unsafe.Sizeof(uintptr(0)) == 4 { - return uintptr(Rand32()) - } - return uintptr(Rand64()) -} - // MapKeyHasher returns a hash function for pointers of m's key type. // // Preconditions: m must be a map.