diff --git a/CHANGELOG.md b/CHANGELOG.md index 5558cdf3795..27bee5305d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,24 @@ and this project adheres to ### Added +- [#5510](https://github.com/firecracker-microvm/firecracker/pull/5510), + [#5593](https://github.com/firecracker-microvm/firecracker/pull/5593), + [#5564](https://github.com/firecracker-microvm/firecracker/pull/5564): Add + support for the + [VMClock device](https://uapi-group.org/specifications/specs/vmclock). The + implementation supports the snapshot safety features proposed + [here](https://lore.kernel.org/lkml/20260107132514.437-1-bchalios@amazon.es/), + but doesn't currently provide any clock-specific information for helping the + guest synchronize its clocks. More information can be found in + [docs](docs/snapshotting/snapshot-support.md#userspace-notifications-of-loading-virtual-machine-snapshots). + ### Changed +- [#5564](https://github.com/firecracker-microvm/firecracker/pull/5564): Adding + support for VMClock uses one extra GSI for the VMClock device itself, which + reduces the number of GSIs available for VirtIO devices. The new maximum is 92 + devices on aarch64 and 17 devices on x86_64. 
+ ### Deprecated ### Removed diff --git a/docs/snapshotting/snapshot-support.md b/docs/snapshotting/snapshot-support.md index 6e1ac4d4c35..6f47d2f15e5 100644 --- a/docs/snapshotting/snapshot-support.md +++ b/docs/snapshotting/snapshot-support.md @@ -24,6 +24,7 @@ - [Snapshot security and uniqueness](#snapshot-security-and-uniqueness) - [Secure and insecure usage examples](#usage-examples) - [Reusing snapshotted states securely](#reusing-snapshotted-states-securely) + - [Userspace notifications of loading Virtual Machine snapshots](#userspace-notifications-of-loading-virtual-machine-snapshots) - [Vsock device limitation](#vsock-device-limitation) - [VMGenID device limitation](#vmgenid-device-limitation) - [Where can I resume my snapshots?](#where-can-i-resume-my-snapshots) @@ -590,6 +591,41 @@ identifiers, cached random numbers, cryptographic tokens, etc **will** still be replicated across multiple microVMs resumed from the same snapshot. Users need to implement mechanisms for ensuring de-duplication of such state, where needed. +## Userspace notifications of loading Virtual Machine snapshots + +The VMClock device +([specification](https://uapi-group.org/specifications/specs/vmclock/)) is a +device that enables efficient application clock synchronization against real +wallclock time, for applications running inside Virtual Machines. VMClock also +handles situations in which some sort of disruption happens to the clock. +It handles these through fields in the +[`vmclock_abi`](https://uapi-group.org/specifications/specs/vmclock/#the-vmclock_abi-structure). +Currently, it handles two cases: + +1. Live migration through the `disruption_marker` field. +1. Virtual machine snapshots through the `vm_generation_counter`. + +Whenever a VM starts from a snapshot, VMClock will present a new (different than +what was previously stored) value in the `vm_generation_counter`. This happens +in an atomic way, i.e. 
`vm_generation_counter` will include the new value as +soon as vCPUs are resumed post snapshot loading. + +User space libraries, e.g. userspace PRNGs, can mmap() `vmclock_abi` and monitor +changes in `vm_generation_counter` to observe when they need to adapt and/or +recreate state. + +Moreover, VMClock allows processes to call poll() on the VMClock device and get +notified about changes through an event loop. + +> [!IMPORTANT] Support for `vm_generation_counter` and `poll()` is implemented +> in Linux through the patches +> [here](https://lore.kernel.org/lkml/20260107132514.437-1-bchalios@amazon.es/). +> We have backported these patches to the AL 5.10 and 6.1 kernels +> [here](../../resources/patches/vmclock). The +> kernels suggested in the [Getting Started Guide](../getting-started.md) +> include these patches. When using mainline kernels, users need to make sure +> that they apply the linked patches until these get merged upstream. + ## Vsock device reset The vsock device is reset across snapshot/restore to avoid inconsistent state diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 949435e6c83..5f98431b1d9 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -20,6 +20,7 @@ use crate::arch::{ use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; use crate::device_manager::pci_mngr::PciDevices; +use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestRegionType}; @@ -97,6 +98,7 @@ pub fn create_fdt( create_psci_node(&mut fdt_writer)?; create_devices_node(&mut fdt_writer, device_manager)?; create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_vmclock_node(&mut fdt_writer, &device_manager.acpi_devices.vmclock)?; create_pci_nodes(&mut fdt_writer, 
&device_manager.pci_devices)?; // End Header node. @@ -287,6 +289,18 @@ fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &VmGenId) -> Result<(), Fdt Ok(()) } +fn create_vmclock_node(fdt: &mut FdtWriter, vmclock: &VmClock) -> Result<(), FdtError> { + let vmclock_node = fdt.begin_node(&format!("ptp@{}", vmclock.guest_address.0))?; + fdt.property_string("compatible", "amazon,vmclock")?; + fdt.property_array_u64("reg", &[vmclock.guest_address.0, VMCLOCK_SIZE as u64])?; + fdt.property_array_u32( + "interrupts", + &[GIC_FDT_IRQ_TYPE_SPI, vmclock.gsi, IRQ_TYPE_EDGE_RISING], + )?; + fdt.end_node(vmclock_node)?; + Ok(()) +} + fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), FdtError> { let interrupt = fdt.begin_node("intc")?; fdt.property_string("compatible", gic_device.fdt_compatibility())?; diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 979cd68a285..d1004096059 100644 Binary files a/src/vmm/src/arch/aarch64/output_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_GICv3.dtb differ diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index 63ab6765036..9477bb72d17 100644 Binary files a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb differ diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 97f2e92693f..36be5dd6c47 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -288,7 +288,6 @@ pub fn build_microvm_for_boot( )?; device_manager.attach_vmgenid_device(&vm)?; - #[cfg(target_arch = "x86_64")] device_manager.attach_vmclock_device(&vm)?; #[cfg(target_arch = "aarch64")] diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 9764143b5a9..944d266c53c 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -1,11 +1,11 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +#[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; use vm_memory::GuestMemoryError; use crate::Vm; -#[cfg(target_arch = "x86_64")] use crate::devices::acpi::vmclock::VmClock; use crate::devices::acpi::vmgenid::VmGenId; use crate::vstate::resources::ResourceAllocator; @@ -23,7 +23,6 @@ pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: VmGenId, /// VMclock device - #[cfg(target_arch = "x86_64")] pub vmclock: VmClock, } @@ -32,7 +31,6 @@ impl ACPIDeviceManager { pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { ACPIDeviceManager { vmgenid: VmGenId::new(resource_allocator), - #[cfg(target_arch = "x86_64")] vmclock: VmClock::new(resource_allocator), } } @@ -43,19 +41,19 @@ impl ACPIDeviceManager { Ok(()) } - #[cfg(target_arch = "x86_64")] pub fn attach_vmclock(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + vm.register_irq(&self.vmclock.interrupt_evt, self.vmclock.gsi)?; self.vmclock.activate(vm.guest_memory())?; Ok(()) } } +#[cfg(target_arch = "x86_64")] impl Aml for ACPIDeviceManager { fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { // AML for [`VmGenId`] device. self.vmgenid.append_aml_bytes(v)?; // AML for [`VmClock`] device. - #[cfg(target_arch = "x86_64")] self.vmclock.append_aml_bytes(v)?; // Create the AML for the GED interrupt handler @@ -65,30 +63,37 @@ impl Aml for ACPIDeviceManager { &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, &aml::Name::new( "_CRS".try_into()?, - &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( - true, - true, - false, - false, - self.vmgenid.gsi, - )]), + &aml::ResourceTemplate::new(vec![ + &aml::Interrupt::new(true, true, false, false, self.vmgenid.gsi), + &aml::Interrupt::new(true, true, false, false, self.vmclock.gsi), + ]), )?, + // We know that the maximum IRQ number fits in a u8. We have up to + // 32 IRQs in x86 and up to 128 in ARM (look into `vmm::crate::arch::layout::GSI_LEGACY_END`). 
+ // Both `vmgenid.gsi` and `vmclock.gsi` can safely be cast to `u8` + // without truncation, so we let clippy know. &aml::Method::new( "_EVT".try_into()?, 1, true, - vec![&aml::If::new( - // We know that the maximum IRQ number fits in a u8. We have up to - // 32 IRQs in x86 and up to 128 in - // ARM (look into - // `vmm::crate::arch::layout::GSI_LEGACY_END`) - #[allow(clippy::cast_possible_truncation)] - &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), - vec![&aml::Notify::new( - &aml::Path::new("\\_SB_.VGEN")?, - &0x80usize, - )], - )], + vec![ + &aml::If::new( + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VGEN")?, + &0x80usize, + )], + ), + &aml::If::new( + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmclock.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VCLK")?, + &0x80usize, + )], + ), + ], ), ], ) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index d91bd20e98e..8a0751a61d6 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -237,7 +237,6 @@ impl DeviceManager { Ok(()) } - #[cfg(target_arch = "x86_64")] pub(crate) fn attach_vmclock_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { self.acpi_devices.attach_vmclock(vm)?; Ok(()) @@ -465,6 +464,9 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let mut acpi_devices = ACPIDeviceManager::restore(constructor_args.vm, &state.acpi_state)?; acpi_devices.vmgenid.notify_guest()?; + acpi_devices + .vmclock + .post_load_update(constructor_args.vm.guest_memory()); // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 41111b557e6..d0941349673 100644 --- a/src/vmm/src/device_manager/persist.rs +++ 
b/src/vmm/src/device_manager/persist.rs @@ -15,7 +15,6 @@ use super::mmio::*; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::device_manager::acpi::ACPIDeviceError; -#[cfg(target_arch = "x86_64")] use crate::devices::acpi::vmclock::{VmClock, VmClockState}; use crate::devices::acpi::vmgenid::{VMGenIDState, VmGenId}; #[cfg(target_arch = "aarch64")] @@ -168,7 +167,6 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { vmgenid: VMGenIDState, - #[cfg(target_arch = "x86_64")] vmclock: VmClockState, } @@ -180,7 +178,6 @@ impl<'a> Persist<'a> for ACPIDeviceManager { fn save(&self) -> Self::State { ACPIDeviceManagerState { vmgenid: self.vmgenid.save(), - #[cfg(target_arch = "x86_64")] vmclock: self.vmclock.save(), } } @@ -190,10 +187,14 @@ impl<'a> Persist<'a> for ACPIDeviceManager { // Safe to unwrap() here, this will never return an error. vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), // Safe to unwrap() here, this will never return an error. 
- #[cfg(target_arch = "x86_64")] - vmclock: VmClock::restore(vm.guest_memory(), &state.vmclock).unwrap(), + vmclock: VmClock::restore((), &state.vmclock).unwrap(), }; + vm.register_irq( + &acpi_devices.vmclock.interrupt_evt, + acpi_devices.vmclock.gsi, + )?; + acpi_devices.attach_vmgenid(vm)?; Ok(acpi_devices) } diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs index 134c8393f0c..80228ad848b 100644 --- a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs @@ -38,6 +38,8 @@ pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; +pub const VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: u64 = 256; +pub const VMCLOCK_FLAG_NOTIFICATION_PRESENT: u64 = 512; pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; pub const VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; @@ -153,10 +155,11 @@ pub struct vmclock_abi { pub time_frac_sec: __le64, pub time_esterror_nanosec: __le64, pub time_maxerror_nanosec: __le64, + pub vm_generation_counter: __le64, } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { - ["Size of vmclock_abi"][::std::mem::size_of::() - 104usize]; + ["Size of vmclock_abi"][::std::mem::size_of::() - 112usize]; ["Alignment of vmclock_abi"][::std::mem::align_of::() - 8usize]; ["Offset of field: vmclock_abi::magic"][::std::mem::offset_of!(vmclock_abi, magic) - 0usize]; ["Offset of field: vmclock_abi::size"][::std::mem::offset_of!(vmclock_abi, size) - 4usize]; @@ -198,4 +201,6 @@ const _: () = { [::std::mem::offset_of!(vmclock_abi, time_esterror_nanosec) - 88usize]; ["Offset of field: vmclock_abi::time_maxerror_nanosec"] [::std::mem::offset_of!(vmclock_abi, time_maxerror_nanosec) - 96usize]; + ["Offset of field: vmclock_abi::vm_generation_counter"] + 
[::std::mem::offset_of!(vmclock_abi, vm_generation_counter) - 104usize]; }; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index d7882a78ded..11a7b07cf42 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -6,14 +6,19 @@ use std::mem::offset_of; use std::sync::atomic::{Ordering, fence}; use acpi_tables::{Aml, aml}; -use log::error; +use log::{debug, error}; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryError}; +use vm_superio::Trigger; +use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::acpi::generated::vmclock_abi::{ - VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, + VMCLOCK_COUNTER_INVALID, VMCLOCK_FLAG_NOTIFICATION_PRESENT, + VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, }; +use crate::devices::legacy::EventFdTrigger; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::resources::ResourceAllocator; @@ -22,7 +27,7 @@ use crate::vstate::resources::ResourceAllocator; unsafe impl ByteValued for vmclock_abi {} // We are reserving a physical page to expose the [`VmClock`] data -const VMCLOCK_SIZE: u32 = 0x1000; +pub const VMCLOCK_SIZE: u32 = 0x1000; // Write a value in `vmclock_abi` both in the Firecracker-managed state // and inside guest memory address that corresponds to it. @@ -47,6 +52,10 @@ macro_rules! write_vmclock_field { pub struct VmClock { /// Guest address in which we will write the VMclock struct pub guest_address: GuestAddress, + /// Interrupt line for notifying the device about changes + pub interrupt_evt: EventFdTrigger, + /// GSI number allocated for the device. 
+ pub gsi: u32, /// The [`VmClock`] state we are exposing to the guest inner: vmclock_abi, } @@ -62,17 +71,29 @@ impl VmClock { ) .expect("vmclock: could not allocate guest memory for device"); + let gsi = resource_allocator + .allocate_gsi_legacy(1) + .expect("vmclock: Could not allocate GSI for VMClock: {err}")[0]; + + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .expect("vmclock: Could not create EventFd for VMClock device: {err}"), + ); + let mut inner = vmclock_abi { magic: VMCLOCK_MAGIC, size: VMCLOCK_SIZE, version: 1, clock_status: VMCLOCK_STATUS_UNKNOWN, counter_id: VMCLOCK_COUNTER_INVALID, + flags: VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT | VMCLOCK_FLAG_NOTIFICATION_PRESENT, ..Default::default() }; VmClock { guest_address: GuestAddress(addr), + interrupt_evt, + gsi, inner, } } @@ -98,11 +119,22 @@ impl VmClock { self.inner.disruption_marker.wrapping_add(1) ); - // This fence ensures guest sees the `disruption_marker` update. It is matched to a - // read barrier in the guest. + write_vmclock_field!( + self, + mem, + vm_generation_counter, + self.inner.vm_generation_counter.wrapping_add(1) + ); + + // This fence ensures guest sees the `disruption_marker` and `vm_generation_counter` + // updates. It is matched to a read barrier in the guest. 
fence(Ordering::Release); write_vmclock_field!(self, mem, seq_count, self.inner.seq_count.wrapping_add(1)); + self.interrupt_evt + .trigger() + .expect("vmclock: could not send guest notification: {err}"); + debug!("vmclock: notifying guest about VMClock updates"); } } @@ -113,31 +145,36 @@ impl VmClock { pub struct VmClockState { /// Guest address in which we write the [`VmClock`] info pub guest_address: u64, + /// GSI used for notifying the guest about device changes + pub gsi: u32, /// Data we expose to the guest pub inner: vmclock_abi, } impl<'a> Persist<'a> for VmClock { type State = VmClockState; - type ConstructorArgs = &'a GuestMemoryMmap; + type ConstructorArgs = (); type Error = Infallible; fn save(&self) -> Self::State { VmClockState { guest_address: self.guest_address.0, + gsi: self.gsi, inner: self.inner, } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { + fn restore(vm: Self::ConstructorArgs, state: &Self::State) -> Result { + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .expect("vmclock: Could not create EventFd for VMClock device: {err}"), + ); let mut vmclock = VmClock { guest_address: GuestAddress(state.guest_address), + interrupt_evt, + gsi: state.gsi, inner: state.inner, }; - vmclock.post_load_update(constructor_args); Ok(vmclock) } } @@ -174,14 +211,20 @@ impl Aml for VmClock { #[cfg(test)] mod tests { use vm_memory::{Bytes, GuestAddress}; + use vmm_sys_util::tempfile::TempFile; - use crate::arch; + use crate::Vm; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::layout; + use crate::arch::{self, Kvm}; use crate::devices::acpi::generated::vmclock_abi::vmclock_abi; use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; - use crate::snapshot::Persist; + use crate::devices::virtio::test_utils::default_mem; + use crate::snapshot::{Persist, Snapshot}; use crate::test_utils::single_region_mem; use crate::utils::u64_to_usize; use 
crate::vstate::resources::ResourceAllocator; + use crate::vstate::vm::tests::setup_vm_with_memory; // We are allocating memory from the end of the system memory portion const VMCLOCK_TEST_GUEST_ADDR: GuestAddress = @@ -211,15 +254,17 @@ mod tests { #[test] fn test_device_save_restore() { let vmclock = default_vmclock(); + // We're using memory inside the system memory portion of the guest RAM. So we need a + // memory region that includes it. let mem = single_region_mem( u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), ); vmclock.activate(&mem).unwrap(); - let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); let state = vmclock.save(); - let vmclock_new = VmClock::restore(&mem, &state).unwrap(); + let mut vmclock_new = VmClock::restore((), &state).unwrap(); + vmclock_new.post_load_update(&mem); let guest_data_new: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); assert_ne!(guest_data_new, vmclock.inner); @@ -228,5 +273,9 @@ mod tests { vmclock.inner.disruption_marker + 1, vmclock_new.inner.disruption_marker ); + assert_eq!( + vmclock.inner.vm_generation_counter + 1, + vmclock_new.inner.vm_generation_counter + ); } } diff --git a/tests/host_tools/vmclock-abi.h b/tests/host_tools/vmclock-abi.h index 2d99b29ac44..5c707e263cb 100644 --- a/tests/host_tools/vmclock-abi.h +++ b/tests/host_tools/vmclock-abi.h @@ -115,6 +115,17 @@ struct vmclock_abi { * bit again after the update, using the about-to-be-valid fields. */ #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + /* + * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will + * bump the vm_generation_counter field every time the guest is + * loaded from some save state (restored from a snapshot). + */ +#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + /* + * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send + * a notification every time it updates seq_count to a new even number. 
+ */ +#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) __u8 pad[2]; __u8 clock_status; @@ -177,6 +188,19 @@ struct vmclock_abi { __le64 time_frac_sec; /* Units of 1/2^64 of a second */ __le64 time_esterror_nanosec; __le64 time_maxerror_nanosec; + + /* + * This field changes to another non-repeating value when the VM + * is loaded from a snapshot. This event, typically, represents a + * "jump" forward in time. As a result, in this case as well, the + * guest needs to discard any calibration against external sources. + * Loading a snapshot in a VM has different semantics than other VM + * events such as live migration, i.e. apart from re-adjusting guest + * clocks a guest user space might want to discard UUIDs, reset + * network connections or reseed entropy, etc. As a result, we + * use a dedicated marker for such events. + */ + __le64 vm_generation_counter; }; #endif /* __VMCLOCK_ABI_H__ */ diff --git a/tests/host_tools/vmclock.c b/tests/host_tools/vmclock.c index d69304ac87c..a2a0dd3bfb0 100644 --- a/tests/host_tools/vmclock.c +++ b/tests/host_tools/vmclock.c @@ -1,12 +1,12 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -#include #include #include #include #include #include +#include #include #include #include @@ -16,23 +16,26 @@ const char *VMCLOCK_DEV_PATH = "/dev/vmclock0"; -int get_vmclock_handle(struct vmclock_abi **vmclock) +int open_vmclock(void) { int fd = open(VMCLOCK_DEV_PATH, 0); - if (fd == -1) - goto out_err; + if (fd == -1) { + perror("open"); + exit(1); + } - void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); - if (ptr == MAP_FAILED) - goto out_err_mmap; + return fd; +} - *vmclock = ptr; - return 0; +struct vmclock_abi *get_vmclock_handle(int fd) +{ + void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + exit(1); + } -out_err_mmap: - close(fd); -out_err: - return errno; + return ptr; } #define READ_VMCLOCK_FIELD_FN(type, field) \ @@ -56,23 +59,129 @@ type read##_##field (struct vmclock_abi *vmclock) { \ } READ_VMCLOCK_FIELD_FN(uint64_t, disruption_marker); +READ_VMCLOCK_FIELD_FN(uint64_t, vm_generation_counter); -int main() +/* + * Read `vmclock_abi` structure using a file descriptor pointing to + * `/dev/vmclock0`. + */ +void read_vmclock(int fd, struct vmclock_abi *vmclock) { - struct vmclock_abi *vmclock; + int ret; - int err = get_vmclock_handle(&vmclock); - if (err) { - printf("Could not mmap vmclock struct: %s\n", strerror(err)); + /* + * Use `pread()`, since the device doesn't implement lseek(), so + * we can't reset `fp`. + */ + ret = pread(fd, vmclock, sizeof(*vmclock), 0); + if (ret < 0) { + perror("read"); + exit(1); + } else if (ret < (int) sizeof(*vmclock)) { + fprintf(stderr, "We don't handle partial writes (%d). 
Exiting!\n", ret); exit(1); } +} + +void print_vmclock(struct vmclock_abi *vmclock) +{ + int has_vm_gen_counter = vmclock->flags & VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT; + int has_notifications = vmclock->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT; + printf("VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: %s\n", has_vm_gen_counter ? "true" : "false"); + printf("VMCLOCK_FLAG_NOTIFICATION_PRESENT: %s\n", has_notifications ? "true" : "false"); printf("VMCLOCK_MAGIC: 0x%x\n", vmclock->magic); printf("VMCLOCK_SIZE: 0x%x\n", vmclock->size); printf("VMCLOCK_VERSION: %u\n", vmclock->version); printf("VMCLOCK_CLOCK_STATUS: %u\n", vmclock->clock_status); printf("VMCLOCK_COUNTER_ID: %u\n", vmclock->counter_id); printf("VMCLOCK_DISRUPTION_MARKER: %lu\n", read_disruption_marker(vmclock)); + printf("VMCLOCK_VM_GENERATION_COUNTER: %lu\n", read_vm_generation_counter(vmclock)); + fflush(stdout); +} + +void run_poll(int fd) +{ + struct vmclock_abi vmclock; + int epfd, ret, nfds; + struct epoll_event ev; + + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + + epfd = epoll_create(1); + if (epfd < 0) { + perror("epoll_create"); + exit(1); + } + + ev.events = EPOLLIN | EPOLLRDNORM; + ev.data.fd = fd; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + if (ret < 0) { + perror("epoll_add"); + exit(1); + } + + while (1) { + nfds = epoll_wait(epfd, &ev, 1, -1); + if (nfds < 0) { + perror("epoll_wait"); + exit(1); + } + + if (ev.data.fd != fd) { + fprintf(stderr, "Unknown file descriptor %d\n", ev.data.fd); + exit(1); + } + + if (ev.events & EPOLLHUP) { + fprintf(stderr, "Device does not support notifications. 
Stop polling\n"); + exit(1); + } else if (ev.events & EPOLLIN) { + fprintf(stdout, "Got VMClock notification\n"); + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + } + } +} + +void print_help_message() +{ + fprintf(stderr, "usage: vmclock MODE\n"); + fprintf(stderr, "Available modes:\n"); + fprintf(stderr, " -r\tRead vmclock_abi using read()\n"); + fprintf(stderr, " -m\tRead vmclock_abi using mmap()\n"); + fprintf(stderr, " -p\tPoll VMClock for changes\n"); +} + +int main(int argc, char *argv[]) +{ + int fd; + struct vmclock_abi vmclock, *vmclock_ptr; + + if (argc != 2) { + print_help_message(); + exit(1); + } + + fd = open_vmclock(); + + if (!strncmp(argv[1], "-r", 2)) { + printf("Reading VMClock with read()\n"); + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + } else if (!strncmp(argv[1], "-m", 2)) { + printf("Reading VMClock with mmap()\n"); + vmclock_ptr = get_vmclock_handle(fd); + print_vmclock(vmclock_ptr); + } else if (!strncmp(argv[1], "-p", 2)) { + printf("Polling VMClock\n"); + run_poll(fd); + } else { + print_help_message(); + exit(1); + } return 0; } diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index 54153b27d2d..bd51d4e53a9 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -18,13 +18,13 @@ def max_devices(uvm): match platform.machine(): case "aarch64": # On aarch64, IRQs are available from 32 to 127. We always use one IRQ each for - # the VMGenID, RTC and serial devices, so the maximum number of devices supported - # at the same time is 93. - return 93 + # the VMGenID, VMClock, RTC and serial devices, so the maximum number of devices + # supported at the same time is 92. + return 92 case "x86_64": - # IRQs are available from 5 to 23. We always use one IRQ for VMGenID device, so - # the maximum number of devices supported at the same time is 18. 
- return 18 + # IRQs are available from 5 to 23. We always use one IRQ for VMGenID and VMClock + # devices, so the maximum number of devices supported at the same time is 17. + return 17 case _: raise ValueError("Unknown platform") diff --git a/tests/integration_tests/functional/test_vmclock.py b/tests/integration_tests/functional/test_vmclock.py index b487526abdb..925c6b021c5 100644 --- a/tests/integration_tests/functional/test_vmclock.py +++ b/tests/integration_tests/functional/test_vmclock.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """Test VMclock device emulation""" -import platform - import pytest @@ -21,40 +19,88 @@ def vm_with_vmclock(uvm_plain_acpi, bin_vmclock_path): yield basevm -def parse_vmclock(vm): +def parse_vmclock(vm, use_mmap=False): """Parse the VMclock struct inside the guest and return a dictionary with its fields""" - _, stdout, _ = vm.ssh.check_output("/tmp/vmclock") + + cmd = "/tmp/vmclock -m" if use_mmap else "/tmp/vmclock -r" + _, stdout, _ = vm.ssh.check_output(cmd) + fields = stdout.strip().split("\n") + if use_mmap: + assert fields[0] == "Reading VMClock with mmap()" + else: + assert fields[0] == "Reading VMClock with read()" + + return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) + + +def parse_vmclock_from_poll(vm, expected_notifications): + """Parse the output of the 'vmclock -p' command in the guest""" + + _, stdout, _ = vm.ssh.check_output("cat /tmp/vmclock.out") fields = stdout.strip().split("\n") - return dict(item.split(": ") for item in fields) + + nr_notifications = 0 + for line in fields: + if line == "Got VMClock notification": + nr_notifications += 1 + + assert nr_notifications == expected_notifications + return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) -@pytest.mark.skipif( - platform.machine() != "x86_64", - reason="VMClock device is currently supported only on x86 systems", -) -def test_vmclock_fields(vm_with_vmclock): 
+@pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) +def test_vmclock_read_fields(vm_with_vmclock, use_mmap): """Make sure that we expose the expected values in the VMclock struct""" vm = vm_with_vmclock - vmclock = parse_vmclock(vm) + vmclock = parse_vmclock(vm, use_mmap) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" assert vmclock["VMCLOCK_MAGIC"] == "0x4b4c4356" assert vmclock["VMCLOCK_SIZE"] == "0x1000" assert vmclock["VMCLOCK_VERSION"] == "1" assert vmclock["VMCLOCK_CLOCK_STATUS"] == "0" assert vmclock["VMCLOCK_COUNTER_ID"] == "255" assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" -@pytest.mark.skipif( - platform.machine() != "x86_64", - reason="VMClock device is currently supported only on x86 systems", -) -def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type): - """Test that `disruption_marker` is updated upon snapshot resume""" +@pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) +def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type, use_mmap): + """Test that `disruption_marker` and `vm_generation_counter` are updated + upon snapshot resume""" basevm = vm_with_vmclock - vmclock = parse_vmclock(basevm) + vmclock = parse_vmclock(basevm, use_mmap) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" + + snapshot = basevm.make_snapshot(snapshot_type) + basevm.kill() + + for i, vm in enumerate( + microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) + ): + vmclock = parse_vmclock(vm, use_mmap) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" + + +def 
test_vmclock_notifications(vm_with_vmclock, microvm_factory, snapshot_type): + """Test that Firecracker will send a notification on snapshot load""" + basevm = vm_with_vmclock + + # Launch vmclock utility in polling mode + basevm.ssh.check_output("/tmp/vmclock -p > /tmp/vmclock.out 2>&1 &") + + # We should not have received any notification yet + vmclock = parse_vmclock_from_poll(basevm, 0) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" snapshot = basevm.make_snapshot(snapshot_type) basevm.kill() @@ -62,5 +108,6 @@ def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type): for i, vm in enumerate( microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) ): - vmclock = parse_vmclock(vm) + vmclock = parse_vmclock_from_poll(vm, i + 1) assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" diff --git a/tools/bindgen-patches/0004-vmclock-notify.patch b/tools/bindgen-patches/0004-vmclock-notify.patch new file mode 100644 index 00000000000..4458f777163 --- /dev/null +++ b/tools/bindgen-patches/0004-vmclock-notify.patch @@ -0,0 +1,33 @@ +diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +index 134c8393f..80228ad84 100644 +--- a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs ++++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +@@ -38,6 +38,8 @@ pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; + pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; + pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; + pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; ++pub const VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: u64 = 256; ++pub const VMCLOCK_FLAG_NOTIFICATION_PRESENT: u64 = 512; + pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; + pub 
const VMCLOCK_STATUS_INITIALIZING: u8 = 1; + pub const VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; +@@ -153,10 +155,11 @@ pub struct vmclock_abi { + pub time_frac_sec: __le64, + pub time_esterror_nanosec: __le64, + pub time_maxerror_nanosec: __le64, ++ pub vm_generation_counter: __le64, + } + #[allow(clippy::unnecessary_operation, clippy::identity_op)] + const _: () = { +- ["Size of vmclock_abi"][::std::mem::size_of::() - 104usize]; ++ ["Size of vmclock_abi"][::std::mem::size_of::() - 112usize]; + ["Alignment of vmclock_abi"][::std::mem::align_of::() - 8usize]; + ["Offset of field: vmclock_abi::magic"][::std::mem::offset_of!(vmclock_abi, magic) - 0usize]; + ["Offset of field: vmclock_abi::size"][::std::mem::offset_of!(vmclock_abi, size) - 4usize]; +@@ -198,4 +201,6 @@ const _: () = { + [::std::mem::offset_of!(vmclock_abi, time_esterror_nanosec) - 88usize]; + ["Offset of field: vmclock_abi::time_maxerror_nanosec"] + [::std::mem::offset_of!(vmclock_abi, time_maxerror_nanosec) - 96usize]; ++ ["Offset of field: vmclock_abi::vm_generation_counter"] ++ [::std::mem::offset_of!(vmclock_abi, vm_generation_counter) - 104usize]; + };