diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 2aac1a1ef7..f420a61241 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -17,6 +17,18 @@ go_template_instance( }, ) +go_template_instance( + name = "atomicptrmap_tid_vcpu", + out = "atomicptrmap_tid_vcpu_unsafe.go", + package = "kvm", + prefix = "vCPUsByTID", + template = "//pkg/sync/atomicptrmap:generic_atomicptrmap", + types = { + "Key": "uint64", + "Value": "vCPU", + }, +) + config_setting( name = "debug_build", values = { @@ -31,6 +43,7 @@ go_library( "address_space_amd64.go", "address_space_arm64.go", "atomicptr_machine_unsafe.go", + "atomicptrmap_tid_vcpu_unsafe.go", "bluepill.go", "bluepill_allocator.go", "bluepill_amd64.go", @@ -76,6 +89,7 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/fd", + "//pkg/gohacks", "//pkg/hostarch", "//pkg/hostos", "//pkg/hostsyscall", @@ -130,6 +144,7 @@ go_test( "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/time", + "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], ) @@ -164,6 +179,7 @@ go_test( "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/time", + "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index fe161acdd0..927d2d923a 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -70,14 +70,18 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { // Copy out registers. bluepillArchExit(c, bluepillArchContext(context)) - // Return to the vCPUReady state; notify any waiters. - user := c.state.Load() & vCPUUser - switch c.state.Swap(user) { - case user | vCPUGuest: // Expected case. - case user | vCPUGuest | vCPUWaiter: + // Unset vCPUGuest and vCPUWaiter; notify any waiters. + oldState := c.state.Load() + if oldState&vCPUGuest == 0 { + throw("invalid state in kvm.bluepillGuestExit") + } + newState := oldState &^ (vCPUGuest | vCPUWaiter) + oldState = c.state.Swap(newState) + if newState&vCPUStateMask == vCPUReady { + c.machine.availableNotify() + } + if oldState&vCPUWaiter != 0 { c.notify() - default: - throw("invalid state") } } @@ -116,13 +120,14 @@ func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) - // Mark this as guest mode. - switch c.state.Swap(vCPUGuest | vCPUUser) { - case vCPUUser: // Expected case. - case vCPUUser | vCPUWaiter: + // Set vCPUGuest; unset vCPUWaiter; notify waiters. + state := c.state.Load() + if state&(vCPUUser|vCPUGuest) != vCPUUser { + throw("invalid state in bluepillHandler") + } + state = c.state.Swap((state | vCPUGuest) &^ vCPUWaiter) + if state&vCPUWaiter != 0 { c.notify() - default: - throw("invalid state") } for { diff --git a/pkg/sentry/platform/kvm/config.go b/pkg/sentry/platform/kvm/config.go index 8cf8fa5c87..76d28876db 100644 --- a/pkg/sentry/platform/kvm/config.go +++ b/pkg/sentry/platform/kvm/config.go @@ -18,6 +18,16 @@ package kvm // Config sets configuration options for each platform instance. -type Config struct{} +type Config struct { + // MaxVCPUs is the maximum number of vCPUs the platform instance will + // create. If MaxVCPUs is 0, the platform will choose a reasonable default. 
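+ // MaxVCPUs must not be negative. Values above the per-VM limit reported
+ // by KVM_CAP_MAX_VCPUS are capped to that limit.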
+ MaxVCPUs int +} -func (*machine) applyConfig(config *Config) error { return nil } +func (m *machine) applyConfig(config *Config) error { + if config.MaxVCPUs < 0 { + return fmt.Errorf("invalid Config.MaxVCPUs: %d", config.MaxVCPUs) + } + m.maxVCPUs = config.MaxVCPUs + return nil +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 69070b3e88..c43e88dbb4 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -16,6 +16,7 @@ package kvm import ( + "encoding/binary" "fmt" "golang.org/x/sys/unix" @@ -76,6 +77,11 @@ type KVM struct { var ( globalOnce sync.Once globalErr error + + // vCPUStateOffset is the offset in bytes from the start of vCPU.state to + // the start of the lower 4 bytes of vCPU.state, which varies depending on + // endianness. + vCPUStateOffset uintptr ) // OpenDevice opens the KVM device and returns the File. @@ -97,6 +103,18 @@ func New(deviceFile *fd.FD, config Config) (*KVM, error) { // Ensure global initialization is done. globalOnce.Do(func() { + var buf [8]byte + binary.NativeEndian.PutUint64(buf[:], 0x1122334455667788) + switch binary.NativeEndian.Uint32(buf[0:4]) { + case 0x11223344: // big endian + vCPUStateOffset = 4 + case 0x55667788: // little endian + vCPUStateOffset = 0 + default: + globalErr = fmt.Errorf("failed to determine endianness for kvm.vCPUStateOffset") + return + } + globalErr = updateGlobalOnce(int(fd)) }) if globalErr != nil { diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 48a501c016..dd17d0d3ba 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -18,6 +18,7 @@ import ( "math/rand" "os" "reflect" + "runtime" "testing" "time" @@ -32,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" ktime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sync" ) // dummyFPState is initialized in TestMain. @@ -90,7 +92,7 @@ func bluepillTest(t testHarness, fn func(*vCPU)) { func TestKernelSyscall(t *testing.T) { bluepillTest(t, func(c *vCPU) { redpill() // Leave guest mode. - if got := c.state.Load(); got != vCPUUser { + if got := c.state.Load(); got&vCPUStateMask != vCPUUser { t.Errorf("vCPU not in ready state: got %v", got) } }) @@ -108,7 +110,7 @@ func TestKernelFault(t *testing.T) { hostFault() // Ensure recovery works. bluepillTest(t, func(c *vCPU) { hostFault() - if got := c.state.Load(); got != vCPUUser { + if got := c.state.Load(); got&vCPUStateMask != vCPUUser { t.Errorf("vCPU not in ready state: got %v", got) } }) @@ -310,6 +312,12 @@ func TestBounceStress(t *testing.T) { time.Sleep(time.Duration(n) * time.Microsecond) } } + m := c.machine + // Lock to this thread so that m.Get() always gives us the same vCPU, + // since applicationTest() => kvmTest() assumes that we return holding + // c. + runtime.LockOSThread() + defer runtime.UnlockOSThread() for i := 0; i < 1000; i++ { // Start an asynchronously executing goroutine that // calls Bounce at pseudo-random point in time. @@ -328,9 +336,11 @@ func TestBounceStress(t *testing.T) { }, &si); err != platform.ErrContextInterrupt { t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) } - c.unlock() + m.Put(c) randomSleep() - c.lock() + if c2 := m.Get(); c != c2 { + t.Fatalf("machine.Get() changed vCPUs from %p to %p", c, c2) + } } return false }) @@ -480,6 +490,63 @@ func TestKernelVDSO(t *testing.T) { }) } +// Regression test for b/404271139. 
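+//
+// With Config.MaxVCPUs set to 1, two goroutines repeatedly hand the single
+// vCPU back and forth through machine.Get and machine.Put; the test hangs if
+// a waiter in Get is never notified that the vCPU has become ready again.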
+func TestSingleVCPU(t *testing.T) { + // Create the machine. + deviceFile, err := OpenDevice("") + if err != nil { + t.Fatalf("error opening device file: %v", err) + } + k, err := New(deviceFile, Config{ + MaxVCPUs: 1, + }) + if err != nil { + t.Fatalf("error creating KVM instance: %v", err) + } + defer k.machine.Destroy() + + // Ping-pong the single vCPU between two goroutines. The test passes if + // this does not deadlock. + stopC := make(chan struct{}) + var doneWG sync.WaitGroup + defer func() { + close(stopC) + doneWG.Wait() + }() + var wakeC [2]chan struct{} + for i := range wakeC { + wakeC[i] = make(chan struct{}, 1) + } + for i := range wakeC { + doneWG.Add(1) + go func(i int) { + defer doneWG.Done() + for { + // Multiple ready channels in a select statement are chosen + // from randomly, so have a separate non-blocking receive from + // stopC first to ensure that it's honored in deterministic + // time. + select { + case <-stopC: + return + default: + } + select { + case <-stopC: + return + case <-wakeC[i]: + c := k.machine.Get() + bluepill(c) + wakeC[1-i] <- struct{}{} + k.machine.Put(c) + } + } + }(i) + } + wakeC[0] <- struct{}{} + time.Sleep(time.Second) +} + func BenchmarkApplicationSyscall(b *testing.B) { var ( i int // Iteration includes machine.Get() / machine.Put(). diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index a89dea1706..ad8aae5c76 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -57,23 +57,24 @@ type machine struct { // kernel is the set of global structures. kernel ring0.Kernel - // mu protects vCPUs. - mu sync.RWMutex - - // available is notified when vCPUs are available. - available sync.Cond - // vCPUsByTID are the machine vCPUs. // // These are populated dynamically. - vCPUsByTID map[uint64]*vCPU + vCPUsByTID vCPUsByTIDAtomicPtrMap + + // availableWaiters is the number of goroutines waiting for a vCPU to + // become ready. + availableWaiters atomicbitops.Int32 + + // availableSeq is incremented whenever a vCPU becomes ready. + availableSeq atomicbitops.Uint32 // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID. vCPUsByID []*vCPU // usedVCPUs is the number of vCPUs that have been used from the // vCPUsByID pool. - usedVCPUs int + usedVCPUs atomicbitops.Int32 // maxVCPUs is the maximum number of vCPUs supported by the machine. maxVCPUs int @@ -88,20 +89,25 @@ type machine struct { usedSlots []uintptr } +// Bits in vCPU.state: const ( // vCPUReady is an alias for all the below clear. - vCPUReady uint32 = 0 + vCPUReady = 0 // vCPUser indicates that the vCPU is in or about to enter user mode. - vCPUUser uint32 = 1 << 0 + vCPUUser = 1 << 0 // vCPUGuest indicates the vCPU is in guest mode. - vCPUGuest uint32 = 1 << 1 + vCPUGuest = 1 << 1 // vCPUWaiter indicates that there is a waiter. // // If this is set, then notify must be called on any state transitions. - vCPUWaiter uint32 = 1 << 2 + vCPUWaiter = 1 << 2 + + vCPUTIDShift = 32 + vCPUStateMask = (uint64(1) << vCPUTIDShift) - 1 + vCPUTIDMask = ^vCPUStateMask ) // Field values for the get_vcpu metric acquisition path used. @@ -181,9 +187,6 @@ type vCPU struct { // fd is the vCPU fd. fd int - // tid is the last set tid. - tid atomicbitops.Uint64 - // userExits is the count of user exits. userExits atomicbitops.Uint64 @@ -193,10 +196,10 @@ type vCPU struct { // faults is a count of world faults (informational only). faults uint32 - // state is the vCPU state. 
- // - // This is a bitmask of the three fields (vCPU*) described above. - state atomicbitops.Uint32 + // The bottom 32 bits of state is a bitset of state fields; see vCPUUser + // and company above. If vCPUGuest or vCPUUser are set, then the top 32 + // bits of state are the thread ID of the thread that has set them. + state atomicbitops.Uint64 // runData for this vCPU. runData *runData @@ -270,7 +273,6 @@ var forceMappingEntireAddressSpace = false func newMachine(vm int, config *Config) (*machine, error) { // Create the machine. m := &machine{fd: vm} - m.available.L = &m.mu if err := m.applyConfig(config); err != nil { panic(fmt.Sprintf("error setting config parameters: %s", err)) @@ -279,7 +281,6 @@ func newMachine(vm int, config *Config) (*machine, error) { // Pull the maximum vCPUs. m.getMaxVCPU() log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) - m.vCPUsByTID = make(map[uint64]*vCPU) m.vCPUsByID = make([]*vCPU, m.maxVCPUs) m.kernel.Init(m.maxVCPUs) @@ -510,109 +511,118 @@ func (m *machine) Destroy() { // the current context in guest, the vCPU of it must be the same as what // Get() returns. func (m *machine) Get() *vCPU { - m.mu.RLock() runtime.LockOSThread() tid := hosttid.Current() // Check for an exact match. - if c := m.vCPUsByTID[tid]; c != nil { - c.lock() - m.mu.RUnlock() - getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) - return c + if c := m.vCPUsByTID.Load(tid); c != nil { + // If this succeeds, we can ignore vCPUGuest since it just indicates + // that we're already in guest mode. + if state := c.state.Load(); state>>vCPUTIDShift == tid && c.state.CompareAndSwap(state, state|vCPUUser) { + getVCPUCounter.Increment(&getVCPUAcquisitionFastReused) + return c + } } - // The happy path failed. We now proceed to acquire an exclusive lock - // (because the vCPU map may change), and scan all available vCPUs. - // In this case, we first unlock the OS thread. Otherwise, if mu is - // not available, the current system thread will be parked and a new - // system thread spawned. We avoid this situation by simply refreshing - // tid after relocking the system thread. - m.mu.RUnlock() - runtime.UnlockOSThread() - m.mu.Lock() - + // Get vCPU from the m.vCPUsByID pool. + owned := (tid << vCPUTIDShift) | vCPUUser for { - runtime.LockOSThread() - tid = hosttid.Current() - - // Recheck for an exact match. - if c := m.vCPUsByTID[tid]; c != nil { - c.lock() - m.mu.Unlock() - getVCPUCounter.Increment(&getVCPUAcquisitionReused) - return c + usedVCPUs := m.usedVCPUs.Load() + if int(usedVCPUs) >= m.maxVCPUs { + break } + // XXX switch to Add + if !m.usedVCPUs.CompareAndSwap(usedVCPUs, usedVCPUs+1) { + continue + } + c := m.vCPUsByID[usedVCPUs] + c.state.Store(owned) + m.vCPUsByTID.Store(tid, c) + c.loadSegments() + getVCPUCounter.Increment(&getVCPUAcquisitionUnused) + return c + } - // Get vCPU from the m.vCPUsByID pool. - if m.usedVCPUs < m.maxVCPUs { - c := m.vCPUsByID[m.usedVCPUs] - m.usedVCPUs++ - c.lock() - m.vCPUsByTID[tid] = c - m.mu.Unlock() - c.loadSegments(tid) + // Scan for an available vCPU. + for origTID, c := range m.vCPUsByTID.RangeRepeatable { + // We can't steal vCPUs from other threads that are in guest mode. + if state := c.state.Load(); state&vCPUStateMask == vCPUReady && c.state.CompareAndSwap(state, owned) { + m.vCPUsByTID.CompareAndSwap(origTID, c, nil) + m.vCPUsByTID.Store(tid, c) + c.loadSegments() getVCPUCounter.Increment(&getVCPUAcquisitionUnused) return c } + } - // Scan for an available vCPU. 
- for origTID, c := range m.vCPUsByTID { - if c.state.CompareAndSwap(vCPUReady, vCPUUser) { - delete(m.vCPUsByTID, origTID) - m.vCPUsByTID[tid] = c - m.mu.Unlock() - c.loadSegments(tid) + // Wait for an available vCPU. + m.availableWaiters.Add(1) + for { + epoch := m.availableSeq.Load() + for origTID, c := range m.vCPUsByTID.RangeRepeatable { + if state := c.state.Load(); state&vCPUStateMask == vCPUReady && c.state.CompareAndSwap(state, owned) { + m.vCPUsByTID.CompareAndSwap(origTID, c, nil) + m.vCPUsByTID.Store(tid, c) + m.availableWaiters.Add(-1) + c.loadSegments() getVCPUCounter.Increment(&getVCPUAcquisitionUnused) return c } } - // Scan for something not in user mode. - for origTID, c := range m.vCPUsByTID { - if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) { - continue - } + // All vCPUs are already in guest mode. Wait until a vCPU becomes + // available. m.availableWait() blocks in the host, but unlocking the + // OS thread still makes waking up less expensive if sysmon steals our + // P while we're blocked. + runtime.UnlockOSThread() + m.availableWait(epoch) + runtime.LockOSThread() + tid = hosttid.Current() + owned = (tid << vCPUTIDShift) | vCPUUser - // The vCPU is not be able to transition to - // vCPUGuest|vCPUWaiter or to vCPUUser because that - // transition requires holding the machine mutex, as we - // do now. There is no path to register a waiter on - // just the vCPUReady state. - for { - c.waitUntilNot(vCPUGuest | vCPUWaiter) - if c.state.CompareAndSwap(vCPUReady, vCPUUser) { - break - } + // Recheck for an exact match. + if c := m.vCPUsByTID.Load(tid); c != nil { + if state := c.state.Load(); state>>vCPUTIDShift == tid && c.state.CompareAndSwap(state, state|vCPUUser) { + m.availableWaiters.Add(-1) + getVCPUCounter.Increment(&getVCPUAcquisitionReused) + return c } - - // Steal the vCPU. - delete(m.vCPUsByTID, origTID) - m.vCPUsByTID[tid] = c - m.mu.Unlock() - c.loadSegments(tid) - getVCPUCounter.Increment(&getVCPUAcquisitionStolen) - return c } - - // Everything is executing in user mode. Wait until something is - // available. As with m.mu.Lock() above, unlock the OS thread while we - // do this to avoid spawning additional system threads. Note that - // signaling the condition variable will have the extra effect of - // kicking the vCPUs out of guest mode if that's where they were. - runtime.UnlockOSThread() - m.available.Wait() } } // Put puts the current vCPU. func (m *machine) Put(c *vCPU) { - c.unlock() - runtime.UnlockOSThread() + // Fast path: + if c.stateLower().CompareAndSwap(vCPUUser|vCPUGuest, vCPUGuest) { + runtime.UnlockOSThread() + return + } - m.mu.RLock() - m.available.Signal() - m.mu.RUnlock() + var ( + oldState uint64 + newState uint64 + ) + // Unset vCPUUser and vCPUWaiter; notify waiters. This needs to CAS since + // we might leave guest mode between c.state.Load() and + // c.state.CompareAndSwap(). + for { + oldState = c.state.Load() + if oldState&vCPUUser == 0 { + panic("putting vCPU not locked by Get") + } + newState = oldState &^ (vCPUUser | vCPUWaiter) + if c.state.CompareAndSwap(oldState, newState) { + break + } + } + runtime.UnlockOSThread() + if newState&vCPUStateMask == vCPUReady { + m.availableNotify() + } + if oldState&vCPUWaiter != 0 { + c.notify() + } } // newDirtySet returns a new dirty set. @@ -625,9 +635,6 @@ func (m *machine) newDirtySet() *dirtySet { // dropPageTables drops cached page table entries. 
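+//
+// No locking is required: vCPUsByID is fully populated during initialization
+// and is only read afterwards.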
func (m *machine) dropPageTables(pt *pagetables.PageTables) { - m.mu.Lock() - defer m.mu.Unlock() - // Clear from all PCIDs. for _, c := range m.vCPUsByID { if c != nil && c.PCIDs != nil { @@ -636,57 +643,6 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) { } } -// lock marks the vCPU as in user mode. -// -// This should only be called directly when known to be safe, i.e. when -// the vCPU is owned by the current TID with no chance of theft. -// -//go:nosplit -func (c *vCPU) lock() { - atomicbitops.OrUint32(&c.state, vCPUUser) -} - -// unlock clears the vCPUUser bit. -// -//go:nosplit -func (c *vCPU) unlock() { - origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) - if origState == vCPUUser|vCPUGuest { - // Happy path: no exits are forced, and we can continue - // executing on our merry way with a single atomic access. - return - } - - // Clear the lock. - for { - state := atomicbitops.CompareAndSwapUint32(&c.state, origState, origState&^vCPUUser) - if state == origState { - break - } - origState = state - } - switch origState { - case vCPUUser: - // Normal state. - case vCPUUser | vCPUGuest | vCPUWaiter: - // Force a transition: this must trigger a notification when we - // return from guest mode. We must clear vCPUWaiter here - // anyways, because BounceToKernel will force a transition only - // from ring3 to ring0, which will not clear this bit. Halt may - // workaround the issue, but if there is no exception or - // syscall in this period, BounceToKernel will hang. - atomicbitops.AndUint32(&c.state, ^vCPUWaiter) - c.notify() - case vCPUUser | vCPUWaiter: - // Waiting for the lock to be released; the responsibility is - // on us to notify the waiter and clear the associated bit. - atomicbitops.AndUint32(&c.state, ^vCPUWaiter) - c.notify() - default: - panic("invalid state") - } -} - // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. // //go:nosplit @@ -704,7 +660,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { origGuestExits := c.guestExits.Load() origUserExits := c.userExits.Load() for { - switch state := c.state.Load(); state { + switch state := c.state.Load(); state & vCPUStateMask { case vCPUReady, vCPUWaiter: // There is nothing to be done, we're already in the // kernel pre-acquisition. The Bounce criteria have @@ -721,7 +677,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { // come from the bluepill handler. c.waitUntilNot(state) case vCPUGuest, vCPUUser | vCPUGuest: - if state == vCPUGuest && !forceGuestExit { + if state&vCPUStateMask == vCPUGuest && !forceGuestExit { // The vCPU is already not acquired, so there's // no need to do a fresh injection here. return @@ -737,7 +693,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { // under memory pressure. Since we already // marked ourselves as a waiter, we need to // ensure that a signal is actually delivered. - if err := unix.Tgkill(pid, int(c.tid.Load()), bounceSignal); err == nil { + if err := unix.Tgkill(pid, int(state>>vCPUTIDShift), bounceSignal); err == nil { break } else if err.(unix.Errno) == unix.EAGAIN { continue @@ -747,7 +703,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { } } case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: - if state == vCPUGuest|vCPUWaiter && !forceGuestExit { + if state&vCPUStateMask == vCPUGuest|vCPUWaiter && !forceGuestExit { // See above. return } @@ -756,7 +712,7 @@ func (c *vCPU) bounce(forceGuestExit bool) { c.waitUntilNot(state) default: // Should not happen: the above is exhaustive. 
- panic("invalid state") + panic("invalid state in kvm.vCPU.bounce") } // Check if we've missed the state transition, but diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 342013d90b..acc854f1de 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -49,11 +49,9 @@ func (m *machine) initArchState() error { } // Initialize all vCPUs to minimize kvm ioctl-s allowed by seccomp filters. - m.mu.Lock() for i := 0; i < m.maxVCPUs; i++ { m.createVCPU(i) } - m.mu.Unlock() return nil } @@ -497,21 +495,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { // getMaxVCPU get max vCPU number func (m *machine) getMaxVCPU() { + if m.maxVCPUs == 0 { + // The goal here is to avoid vCPU contentions for reasonable workloads. + // But "reasonable" isn't defined well in this case. Let's say that CPU + // overcommit with factor 2 is still acceptable. We allocate a set of + // vCPU for each goruntime processor (P) and two sets of vCPUs to run + // user code. + m.maxVCPUs = 3 * runtime.GOMAXPROCS(0) + } + + // Apply KVM limit. maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) if errno != 0 { - m.maxVCPUs = _KVM_NR_VCPUS - } else { - m.maxVCPUs = int(maxVCPUs) + maxVCPUs = _KVM_NR_VCPUS } - - // The goal here is to avoid vCPU contentions for reasonable workloads. - // But "reasonable" isn't defined well in this case. Let's say that CPU - // overcommit with factor 2 is still acceptable. We allocate a set of - // vCPU for each goruntime processor (P) and two sets of vCPUs to run - // user code. - rCPUs := runtime.GOMAXPROCS(0) - if 3*rCPUs < m.maxVCPUs { - m.maxVCPUs = 3 * rCPUs + if m.maxVCPUs > int(maxVCPUs) { + m.maxVCPUs = int(maxVCPUs) } } diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 32a79939af..d17f7a26b8 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -35,7 +35,7 @@ func rdgsbase() uint64 // This may be called from within the signal context and throws on error. // //go:nosplit -func (c *vCPU) loadSegments(tid uint64) { +func (c *vCPU) loadSegments() { if errno := hostsyscall.RawSyscallErrno(unix.SYS_SIGALTSTACK, 0, uintptr(unsafe.Pointer(&c.signalStack)), 0); errno != 0 { throw("sigaltstack") } @@ -58,7 +58,6 @@ func (c *vCPU) loadSegments(tid uint64) { throw("getting GS segment") } } - c.tid.Store(tid) } // setCPUID sets the CPUID to be used by the guest. diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index bcc7fb7760..f60e8fff60 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -185,16 +185,12 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, // getMaxVCPU get max vCPU number func (m *machine) getMaxVCPU() { - rmaxVCPUs := runtime.NumCPU() - smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) - // compare the max vcpu number from runtime and syscall, use smaller one. - if errno != 0 { - m.maxVCPUs = rmaxVCPUs - } else { - if rmaxVCPUs < int(smaxVCPUs) { - m.maxVCPUs = rmaxVCPUs - } else { - m.maxVCPUs = int(smaxVCPUs) - } + if m.maxVCPUs == 0 { + m.maxVCPUs = runtime.NumCPU() + } + + // Apply KVM limit. 
+ if maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 && m.maxVCPUs > int(maxVCPUs) { + m.maxVCPUs = int(maxVCPUs) } } diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 946129da89..6727f70f51 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -53,11 +53,9 @@ func (m *machine) initArchState() error { // The reason for the difference is that ARM64 and x86_64 have different KVM timer mechanisms. // If we create vCPU dynamically on ARM64, the timer for vCPU would mess up for a short time. // For more detail, please refer to https://github.com/google/gvisor/issues/5739 - m.mu.Lock() for i := 0; i < m.maxVCPUs; i++ { m.createVCPU(i) } - m.mu.Unlock() return nil } @@ -259,10 +257,9 @@ func (c *vCPU) setSystemTime() error { } //go:nosplit -func (c *vCPU) loadSegments(tid uint64) { +func (c *vCPU) loadSegments() { // TODO(gvisor.dev/issue/1238): TLS is not supported. // Get TLS from tpidr_el0. - c.tid.Store(tid) } func (c *vCPU) setOneRegister(reg *kvmOneReg) error { diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index cf2e36c5c7..15808b643f 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -114,6 +114,49 @@ func (a *atomicAddressSpace) get() *addressSpace { return (*addressSpace)(atomic.LoadPointer(&a.pointer)) } +// availableNotify is called when a vCPU's state transitions to vCPUReady. +// +//go:nosplit +func (m *machine) availableNotify() { + m.availableSeq.Add(1) + if m.availableWaiters.Load() == 0 { + return + } + errno := hostsyscall.RawSyscallErrno( // escapes: no. + unix.SYS_FUTEX, + uintptr(unsafe.Pointer(&m.availableSeq)), + linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, + 1) + if errno != 0 { + throw("futex wake error") + } +} + +// availableWait blocks until availableNotify is called. +// +// Preconditions: +// - epoch was the value of m.availableSeq before the caller last checked that +// no vCPUs were in state vCPUReady. +// - m.availableWaiters must be non-zero. +// +//go:nosplit +func (m *machine) availableWait(epoch uint32) { + _, _, errno := unix.Syscall6( + unix.SYS_FUTEX, + uintptr(unsafe.Pointer(&m.availableSeq)), + linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, + uintptr(epoch), + 0, 0, 0) + if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN { + panic("futex wait error") + } +} + +// stateLower returns a pointer to the lower 32 bits of c.state. +func (c *vCPU) stateLower() *atomicbitops.Uint32 { + return (*atomicbitops.Uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&c.state)) + vCPUStateOffset)) +} + // notify notifies that the vCPU has transitioned modes. // // This may be called by a signal handler and therefore throws on error. @@ -122,7 +165,7 @@ func (a *atomicAddressSpace) get() *addressSpace { func (c *vCPU) notify() { errno := hostsyscall.RawSyscallErrno( // escapes: no. unix.SYS_FUTEX, - uintptr(unsafe.Pointer(&c.state)), + uintptr(unsafe.Pointer(c.stateLower())), linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, // Number of waiters. math.MaxInt32) @@ -137,12 +180,12 @@ func (c *vCPU) notify() { // appropriate action to cause a transition (e.g. interrupt injection). // // This panics on error. 
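+//
+// Only the low 32 bits of state are used: the futex waits on the 32-bit word
+// at vCPUStateOffset within c.state.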
-func (c *vCPU) waitUntilNot(state uint32) { +func (c *vCPU) waitUntilNot(state uint64) { _, _, errno := unix.Syscall6( unix.SYS_FUTEX, - uintptr(unsafe.Pointer(&c.state)), + uintptr(unsafe.Pointer(c.stateLower())), linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, - uintptr(state), + uintptr(uint32(state)), 0, 0, 0) if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN { panic("futex wait error") diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 2769ab5a92..b9381e69e6 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -17,6 +17,7 @@ package vfs import ( "fmt" "math/bits" + "math/rand/v2" "sync/atomic" "unsafe" @@ -38,7 +39,7 @@ type mountKey struct { var ( mountKeyHasher = sync.MapKeyHasher(map[mountKey]struct{}(nil)) - mountKeySeed = sync.RandUintptr() + mountKeySeed = uintptr(rand.Uint64()) ) func (k *mountKey) hash() uintptr { diff --git a/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go b/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go index 8324d4c0fc..646ed6caff 100644 --- a/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go +++ b/pkg/sync/atomicptrmap/generic_atomicptrmap_unsafe.go @@ -17,6 +17,7 @@ package atomicptrmap import ( + "math/rand/v2" "sync/atomic" "unsafe" @@ -56,7 +57,7 @@ type defaultHasher struct { // Init initializes the Hasher. func (h *defaultHasher) Init() { h.fn = sync.MapKeyHasher(map[Key]*Value(nil)) - h.seed = sync.RandUintptr() + h.seed = uintptr(rand.Uint64()) } // Hash returns the hash value for the given Key. @@ -430,15 +431,18 @@ func (shard *apmShard) rehash(oldSlots unsafe.Pointer) { // // f must not call other methods on m. func (m *AtomicPtrMap) Range(f func(key Key, val *Value) bool) { + // Randomize start position for consistency with Go maps. + start := uintptr(rand.Uint64()) + for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] - if !shard.doRange(f) { + if !shard.doRange(f, start) { return } } } -func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { +func (shard *apmShard) doRange(f func(key Key, val *Value) bool, start uintptr) bool { // We have to lock rehashMu because if we handled races with rehashing by // retrying, f could see the same key twice. shard.rehashMu.Lock() @@ -448,14 +452,19 @@ func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { return true } mask := shard.mask - for i := uintptr(0); i <= mask; i++ { + start &= mask + i := start + for { slot := apmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) - if slotVal == nil || slotVal == tombstone() { - continue + if slotVal != nil && slotVal != tombstone() { + if !f(slot.key, (*Value)(slotVal)) { + return false + } } - if !f(slot.key, (*Value)(slotVal)) { - return false + i = (i + 1) & mask + if i == start { + break } } return true @@ -469,6 +478,9 @@ func (shard *apmShard) doRange(f func(key Key, val *Value) bool) bool { // // - It is safe for f to call other methods on m. func (m *AtomicPtrMap) RangeRepeatable(f func(key Key, val *Value) bool) { + // Randomize start position for consistency with Go maps. 
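+ // Each shard is scanned starting at start & mask and wrapping around, so
+ // callers that stop at the first match do not always favor the same slots.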
+ start := uintptr(rand.Uint64()) + for si := 0; si < len(m.shards); si++ { shard := &m.shards[si] @@ -483,17 +495,22 @@ func (m *AtomicPtrMap) RangeRepeatable(f func(key Key, val *Value) bool) { continue } - for i := uintptr(0); i <= mask; i++ { + startMask := start & mask + i := startMask + for { slot := apmSlotAt(slots, i) slotVal := atomic.LoadPointer(&slot.val) if slotVal == evacuated() { goto retry } - if slotVal == nil || slotVal == tombstone() { - continue + if slotVal != nil && slotVal != tombstone() { + if !f(slot.key, (*Value)(slotVal)) { + return + } } - if !f(slot.key, (*Value)(slotVal)) { - return + i = (i + 1) & mask + if i == startMask { + break } } } diff --git a/pkg/sync/runtime_unsafe.go b/pkg/sync/runtime_unsafe.go index 5bc0a92e07..8c5882da74 100644 --- a/pkg/sync/runtime_unsafe.go +++ b/pkg/sync/runtime_unsafe.go @@ -73,27 +73,6 @@ func Goready(gp uintptr, traceskip int, wakep bool) { } } -// Rand32 returns a non-cryptographically-secure random uint32. -func Rand32() uint32 { - return fastrand() -} - -// Rand64 returns a non-cryptographically-secure random uint64. -func Rand64() uint64 { - return uint64(fastrand())<<32 | uint64(fastrand()) -} - -//go:linkname fastrand runtime.fastrand -func fastrand() uint32 - -// RandUintptr returns a non-cryptographically-secure random uintptr. -func RandUintptr() uintptr { - if unsafe.Sizeof(uintptr(0)) == 4 { - return uintptr(Rand32()) - } - return uintptr(Rand64()) -} - // MapKeyHasher returns a hash function for pointers of m's key type. // // Preconditions: m must be a map.