
Commit 21923bf

nvme_driver: support multi-page allocations for queues (#2061)
The vast majority of this work was done by @chris-oo, many thanks to him!

The outstanding queue depth of the nvme driver can be limited by the number of entries in the IO submission queue. Prior to this change, OpenHCL's nvme driver supported one page worth of submission queue entries; after it, the driver supports as many queue entries as the device does, up to the new four-page maximum.

Contiguous allocations are likely, but not strictly guaranteed. This code opts to preserve forward progress by falling back to single-page queues when contiguous memory cannot be obtained. This is testable (and has been validated manually and with some of the existing CI tests) by not configuring a private pool. There are some other design tradeoffs, noted as comments in the change.
1 parent c6493ea commit 21923bf
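For context on the sizing math: an NVMe submission-queue entry is 64 bytes and a completion-queue entry is 16 bytes, so with 4 KiB pages the old one-page SQ topped out at 64 entries, while the CQ's single page already held 256. A minimal sketch of that arithmetic (assuming 4 KiB pages; the real constants live in queue_pair.rs below):

// Sketch only: the queue-depth arithmetic behind this change, assuming 4 KiB pages.
const PAGE_SIZE: usize = 4096;
const SQ_ENTRY_SIZE: usize = 64; // size_of::<spec::Command>()
const CQ_ENTRY_SIZE: usize = 16; // size_of::<spec::Completion>()

fn main() {
    let old_max_sq = PAGE_SIZE / SQ_ENTRY_SIZE; // 1 page of SQ entries
    let new_max_sq = (PAGE_SIZE * 4) / SQ_ENTRY_SIZE; // 4 pages of SQ entries
    let max_cq = PAGE_SIZE / CQ_ENTRY_SIZE; // CQ stays at 1 page
    assert_eq!((old_max_sq, new_max_sq, max_cq), (64, 256, 256));
}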

File tree

11 files changed: +164 −20


Cargo.lock

Lines changed: 2 additions & 0 deletions
@@ -4819,6 +4819,7 @@ dependencies = [
  "inspect",
  "mesh",
  "open_enum",
+ "static_assertions",
  "storage_string",
  "zerocopy 0.8.25",
 ]
@@ -4998,6 +4999,7 @@ dependencies = [
  "memory_range",
  "mesh",
  "page_pool_alloc",
+ "tracing",
  "user_driver",
  "virt",
  "vmcore",

openhcl/openhcl_dma_manager/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ virt.workspace = true
 vmcore.workspace = true
 
 anyhow.workspace = true
+tracing.workspace = true
 
 [lints]
 workspace = true

openhcl/openhcl_dma_manager/src/lib.rs

Lines changed: 31 additions & 3 deletions
@@ -286,12 +286,20 @@ impl DmaManagerInner {
             allocation_visibility: AllocationVisibility::Private,
             persistent_allocations: false,
             shared_spawner: _,
-            private_spawner: _,
+            private_spawner,
         } => match lower_vtl_policy {
             LowerVtlPermissionPolicy::Any => {
                 // No persistence needed means the `LockedMemorySpawner`
                 // using normal VTL2 ram is fine.
-                DmaClientBacking::LockedMemory(LockedMemorySpawner)
+                match private_spawner {
+                    Some(private) => DmaClientBacking::PrivatePoolWithFallback((
+                        private
+                            .allocator(device_name.into())
+                            .context("failed to create private allocator")?,
+                        LockedMemorySpawner,
+                    )),
+                    None => DmaClientBacking::LockedMemory(LockedMemorySpawner),
+                }
             }
             LowerVtlPermissionPolicy::Vtl0 => {
                 // `LockedMemorySpawner` uses private VTL2 ram, so
@@ -416,6 +424,7 @@ enum DmaClientBacking {
     SharedPool(#[inspect(skip)] PagePoolAllocator),
     PrivatePool(#[inspect(skip)] PagePoolAllocator),
     LockedMemory(#[inspect(skip)] LockedMemorySpawner),
+    PrivatePoolWithFallback(#[inspect(skip)] (PagePoolAllocator, LockedMemorySpawner)),
     PrivatePoolLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<PagePoolAllocator>),
     LockedMemoryLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<LockedMemorySpawner>),
 }
@@ -429,6 +438,16 @@ impl DmaClientBacking {
             DmaClientBacking::SharedPool(allocator) => allocator.allocate_dma_buffer(total_size),
             DmaClientBacking::PrivatePool(allocator) => allocator.allocate_dma_buffer(total_size),
             DmaClientBacking::LockedMemory(spawner) => spawner.allocate_dma_buffer(total_size),
+            DmaClientBacking::PrivatePoolWithFallback((allocator, spawner)) => {
+                allocator.allocate_dma_buffer(total_size).or_else(|err| {
+                    tracing::warn!(
+                        size = total_size,
+                        error = ?err,
+                        "falling back to locked memory for dma allocation"
+                    );
+                    spawner.allocate_dma_buffer(total_size)
+                })
+            }
             DmaClientBacking::PrivatePoolLowerVtl(spawner) => {
                 spawner.allocate_dma_buffer(total_size)
             }
@@ -442,7 +461,16 @@
         match self {
             DmaClientBacking::SharedPool(allocator) => allocator.attach_pending_buffers(),
             DmaClientBacking::PrivatePool(allocator) => allocator.attach_pending_buffers(),
-            DmaClientBacking::LockedMemory(spawner) => spawner.attach_pending_buffers(),
+            DmaClientBacking::PrivatePoolWithFallback(_) => {
+                anyhow::bail!("cannot attach pending buffers with fallback allocator")
+            }
+            DmaClientBacking::LockedMemory(_) => {
+                anyhow::bail!(
+                    "attaching pending buffers is not supported with locked memory; \
+                     this client type does not maintain a pool of pending allocations. \
+                     To use attach_pending_buffers, create a client backed by a shared or private pool."
+                )
+            }
             DmaClientBacking::PrivatePoolLowerVtl(spawner) => spawner.attach_pending_buffers(),
             DmaClientBacking::LockedMemoryLowerVtl(spawner) => spawner.attach_pending_buffers(),
         }

vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs

Lines changed: 14 additions & 1 deletion
@@ -325,6 +325,9 @@ impl<T: DeviceBacking> NvmeDriver<T> {
         )
         .context("failed to create admin queue pair")?;
 
+        let admin_sqes = admin.sq_entries();
+        let admin_cqes = admin.cq_entries();
+
         let admin = worker.admin.insert(admin);
 
         // Register the admin queue with the controller.
@@ -452,6 +455,13 @@ impl<T: DeviceBacking> NvmeDriver<T> {
             let io_cqsize = (MAX_CQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;
             let io_sqsize = (MAX_SQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;
 
+            tracing::debug!(
+                io_cqsize,
+                io_sqsize,
+                hw_size = worker.registers.cap.mqes_z(),
+                "io queue sizes"
+            );
+
             // Some hardware (such as ASAP) require that the sq and cq have the same size.
             io_cqsize.min(io_sqsize)
         };
@@ -630,7 +640,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
             admin: None, // Updated below.
             identify: Some(Arc::new(
                 spec::IdentifyController::read_from_bytes(saved_state.identify_ctrl.as_bytes())
-                    .map_err(|_| RestoreError::InvalidData)?, // TODO: zerocopy: map_err (https://github.com/microsoft/openvmm/issues/759)
+                    .map_err(|_| RestoreError::InvalidData)?,
             )),
             driver: driver.clone(),
             io_issuers,
@@ -955,6 +965,9 @@ impl<T: DeviceBacking> DriverWorkerTask<T> {
             )
             .map_err(|err| DeviceError::IoQueuePairCreationFailure(err, qid))?;
 
+        assert_eq!(queue.sq_entries(), queue.cq_entries());
+        state.qsize = queue.sq_entries();
+
         let io_sq_addr = queue.sq_addr();
         let io_cq_addr = queue.cq_addr();
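One subtlety in the sizing expression above: CAP.MQES is a zero-based count in the NVMe spec (hence the mqes_z() accessor), so the clamp subtracts one from the driver maximum before taking the min, then adds it back. A worked sketch with example values:

// Sketch: clamping a requested queue size against CAP.MQES (a zero-based value).
fn clamp_to_device(driver_max: u16, mqes_zero_based: u16) -> u16 {
    (driver_max - 1).min(mqes_zero_based) + 1
}

fn main() {
    assert_eq!(clamp_to_device(256, 1023), 256); // device allows 1024 entries
    assert_eq!(clamp_to_device(256, 15), 16); // device allows only 16 entries
}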

vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs

Lines changed: 91 additions & 12 deletions
@@ -46,18 +46,23 @@ use zerocopy::FromZeros;
 
 /// Value for unused PRP entries, to catch/mitigate buffer size mismatches.
 const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1);
-/// Maximum SQ size in entries.
-pub const MAX_SQ_ENTRIES: u16 = (PAGE_SIZE / 64) as u16;
-/// Maximum CQ size in entries.
-pub const MAX_CQ_ENTRIES: u16 = (PAGE_SIZE / 16) as u16;
+
+const SQ_ENTRY_SIZE: usize = size_of::<spec::Command>();
+const CQ_ENTRY_SIZE: usize = size_of::<spec::Completion>();
 /// Submission Queue size in bytes.
-const SQ_SIZE: usize = PAGE_SIZE;
+const SQ_SIZE: usize = PAGE_SIZE * 4;
 /// Completion Queue size in bytes.
 const CQ_SIZE: usize = PAGE_SIZE;
+/// Maximum SQ size in entries.
+pub const MAX_SQ_ENTRIES: u16 = (SQ_SIZE / SQ_ENTRY_SIZE) as u16;
+/// Maximum CQ size in entries.
+pub const MAX_CQ_ENTRIES: u16 = (CQ_SIZE / CQ_ENTRY_SIZE) as u16;
 /// Number of pages per queue if bounce buffering.
 const PER_QUEUE_PAGES_BOUNCE_BUFFER: usize = 128;
 /// Number of pages per queue if not bounce buffering.
 const PER_QUEUE_PAGES_NO_BOUNCE_BUFFER: usize = 64;
+/// Number of SQ entries per page (64).
+const SQ_ENTRIES_PER_PAGE: usize = PAGE_SIZE / SQ_ENTRY_SIZE;
 
 #[derive(Inspect)]
 pub(crate) struct QueuePair<T: AerHandler> {
@@ -75,6 +80,8 @@ pub(crate) struct QueuePair<T: AerHandler> {
     sq_entries: u16,
     #[inspect(skip)]
     cq_entries: u16,
+    sq_addr: u64,
+    cq_addr: u64,
 }
 
 impl PendingCommands {
@@ -177,17 +184,31 @@
 }
 
 impl<T: AerHandler> QueuePair<T> {
+    /// Create a new queue pair.
+    ///
+    /// `sq_entries` and `cq_entries` are the requested sizes in entries.
+    /// Calling code should request the largest size it thinks the device
+    /// will support (see `CAP.MQES`). These may be clamped down to what will
+    /// fit in one page should this routine fail to allocate physically
+    /// contiguous memory to back the queues.
+    /// IMPORTANT: Calling code should check the actual sizes via corresponding
+    /// calls to [`QueuePair::sq_entries`] and [`QueuePair::cq_entries`] AFTER calling this routine.
     pub fn new(
         spawner: impl SpawnDriver,
         device: &impl DeviceBacking,
         qid: u16,
-        sq_entries: u16, // Requested SQ size in entries.
-        cq_entries: u16, // Requested CQ size in entries.
+        sq_entries: u16,
+        cq_entries: u16,
         interrupt: DeviceInterrupt,
         registers: Arc<DeviceRegisters<impl DeviceBacking>>,
         bounce_buffer: bool,
         aer_handler: T,
     ) -> anyhow::Result<Self> {
+        // FUTURE: Consider splitting this into several allocations, rather than
+        // allocating the sum total together. This can increase the likelihood
+        // of getting contiguous memory when falling back to the LockedMem
+        // allocator, but this is not the expected path. Be careful that any
+        // changes you make here work with already established save state.
         let total_size = SQ_SIZE
             + CQ_SIZE
             + if bounce_buffer {
@@ -196,6 +217,10 @@ impl<T: AerHandler> QueuePair<T> {
                 PER_QUEUE_PAGES_NO_BOUNCE_BUFFER * PAGE_SIZE
             };
         let dma_client = device.dma_client();
+
+        // TODO: Keepalive: Detect when the allocation came from outside
+        // the private pool and put the device in a degraded state, so it
+        // is possible to inspect that a servicing with keepalive will fail.
         let mem = dma_client
             .allocate_dma_buffer(total_size)
             .context("failed to allocate memory for queues")?;
@@ -217,12 +242,11 @@
         )
     }
 
-    /// Create new object or restore from saved state.
     fn new_or_restore(
         spawner: impl SpawnDriver,
         qid: u16,
-        sq_entries: u16, // Submission queue entries.
-        cq_entries: u16, // Completion queue entries.
+        sq_entries: u16,
+        cq_entries: u16,
         mut interrupt: DeviceInterrupt,
         registers: Arc<DeviceRegisters<impl DeviceBacking>>,
         mem: MemoryBlock,
@@ -235,6 +259,49 @@ impl<T: AerHandler> QueuePair<T> {
         let cq_mem_block = mem.subblock(SQ_SIZE, CQ_SIZE);
         let data_offset = SQ_SIZE + CQ_SIZE;
 
+        // Make sure that the queue memory is physically contiguous. While the
+        // NVMe spec allows for some provisions of queue memory to be
+        // non-contiguous, this depends on device support. At least one device
+        // that we must support requires that the memory is contiguous (via the
+        // CAP.CQR bit). Because of that, just simplify the code paths to use
+        // contiguous memory.
+        //
+        // We could also seek through the memory block to find contiguous pages
+        // (for example, if the first 4 pages are not contiguous, but pages 5-8
+        // are, use those), but other parts of this driver already assume the
+        // math to get the correct offsets.
+        //
+        // N.B. It is expected that allocations from the private pool will
+        // always be contiguous, and that is the normal path. That can fail in
+        // some cases (e.g. if we got some guesses about memory size wrong), and
+        // we prefer to operate in a perf degraded state rather than fail
+        // completely.
+
+        let (sq_is_contiguous, cq_is_contiguous) = (
+            sq_mem_block.contiguous_pfns(),
+            cq_mem_block.contiguous_pfns(),
+        );
+
+        let (sq_entries, cq_entries) = if !sq_is_contiguous || !cq_is_contiguous {
+            tracing::warn!(
+                qid,
+                sq_is_contiguous,
+                sq_mem_block.pfns = ?sq_mem_block.pfns(),
+                cq_is_contiguous,
+                cq_mem_block.pfns = ?cq_mem_block.pfns(),
+                "non-contiguous queue memory detected, falling back to single page queues"
+            );
+            // Clamp both queues to the number of entries that will fit in a
+            // single SQ page (since this will be the smaller between the SQ and
+            // CQ capacity).
+            (SQ_ENTRIES_PER_PAGE as u16, SQ_ENTRIES_PER_PAGE as u16)
+        } else {
+            (sq_entries, cq_entries)
+        };
+
+        let sq_addr = sq_mem_block.pfns()[0] * PAGE_SIZE64;
+        let cq_addr = cq_mem_block.pfns()[0] * PAGE_SIZE64;
+
         let mut queue_handler = match saved_state {
             Some(s) => QueueHandler::restore(sq_mem_block, cq_mem_block, s, aer_handler)?,
             None => {
@@ -296,15 +363,27 @@ impl<T: AerHandler> QueuePair<T> {
             qid,
             sq_entries,
             cq_entries,
+            sq_addr,
+            cq_addr,
         })
     }
 
+    /// Returns the actual number of SQ entries supported by this queue pair.
+    pub fn sq_entries(&self) -> u16 {
+        self.sq_entries
+    }
+
+    /// Returns the actual number of CQ entries supported by this queue pair.
+    pub fn cq_entries(&self) -> u16 {
+        self.cq_entries
+    }
+
     pub fn sq_addr(&self) -> u64 {
-        self.mem.pfns()[0] * PAGE_SIZE64
+        self.sq_addr
     }
 
     pub fn cq_addr(&self) -> u64 {
-        self.mem.pfns()[1] * PAGE_SIZE64
+        self.cq_addr
     }
 
     pub fn issuer(&self) -> &Arc<Issuer> {
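The contiguity check in new_or_restore gates the clamp: if either queue's backing pages are not physically contiguous, both queues fall back to a single page's worth of SQ entries (the smaller of the two queues' one-page capacities). A condensed sketch of that decision, using the constants from this file:

// Sketch of the clamp decision above, in isolation.
const SQ_ENTRIES_PER_PAGE: u16 = 64; // PAGE_SIZE / size_of::<spec::Command>()

fn effective_entries(
    sq_contiguous: bool,
    cq_contiguous: bool,
    requested: (u16, u16),
) -> (u16, u16) {
    if !sq_contiguous || !cq_contiguous {
        // Degrade to single-page queues rather than fail outright.
        (SQ_ENTRIES_PER_PAGE, SQ_ENTRIES_PER_PAGE)
    } else {
        requested
    }
}

fn main() {
    assert_eq!(effective_entries(true, true, (256, 256)), (256, 256));
    assert_eq!(effective_entries(false, true, (256, 256)), (64, 64));
}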

vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs

Lines changed: 4 additions & 2 deletions
@@ -31,6 +31,8 @@ pub(crate) struct QueueFull;
 
 impl SubmissionQueue {
     pub fn new(sqid: u16, len: u16, mem: MemoryBlock) -> Self {
+        tracing::debug!(sqid, len, pfns = ?mem.pfns(), "new submission queue");
+
         Self {
             sqid,
             head: 0,
@@ -122,6 +124,7 @@ pub(crate) struct CompletionQueue {
 
 impl CompletionQueue {
     pub fn new(cqid: u16, len: u16, mem: MemoryBlock) -> CompletionQueue {
+        tracing::debug!(cqid, len, pfns = ?mem.pfns(), "new completion queue");
         Self {
             cqid,
             head: 0,
@@ -138,8 +141,7 @@ impl CompletionQueue {
 
     pub fn read(&mut self) -> Option<spec::Completion> {
         let completion_mem = self.mem.as_slice()
-            [self.head as usize * size_of::<spec::Completion>()..]
-            [..size_of::<spec::Completion>() * 2]
+            [self.head as usize * size_of::<spec::Completion>()..][..size_of::<spec::Completion>()]
             .as_atomic_slice::<AtomicU64>()
             .unwrap();
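The read() change above is also a small bounds fix: the old slice took two completion entries' worth of bytes at the head, which at the last entry would reach past the one-entry window the code actually needs. The [start..][..len] double index is the standard idiom for "len bytes at offset start". A quick illustration:

// Sketch: the `[start..][..len]` indexing idiom used in read().
fn entry_bytes(buf: &[u8], index: usize, entry_size: usize) -> &[u8] {
    &buf[index * entry_size..][..entry_size]
}

fn main() {
    let ring = [0u8; 64]; // e.g. four 16-byte completion entries
    assert_eq!(entry_bytes(&ring, 3, 16).len(), 16); // last entry stays in bounds
}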

vm/devices/storage/nvme_spec/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ mesh.workspace = true
 bitfield-struct.workspace = true
 open_enum.workspace = true
 zerocopy.workspace = true
+static_assertions.workspace = true
 
 [lints]
 workspace = true

vm/devices/storage/nvme_spec/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -157,6 +157,7 @@ pub struct Command {
     pub cdw14: u32,
     pub cdw15: u32,
 }
+static_assertions::assert_eq_size!(Command, [u8; 64]);
 
 #[derive(Inspect)]
 #[bitfield(u32)]
@@ -230,6 +231,7 @@ pub struct Completion {
     pub cid: u16,
     pub status: CompletionStatus,
 }
+static_assertions::assert_eq_size!(Completion, [u8; 16]);
 
 #[bitfield(u16)]
 #[derive(IntoBytes, Immutable, KnownLayout, FromBytes, MeshPayload)]

vm/devices/user_driver/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ pub trait DmaClient: Send + Sync + Inspect {
     /// Allocate a new DMA buffer. This buffer must be zero initialized.
     ///
     /// TODO: string tag for allocation?
+    /// TODO: contiguous vs non-contiguous? (both on the request side, and if
+    /// a requested contiguous allocation cannot be fulfilled)
     fn allocate_dma_buffer(&self, total_size: usize) -> anyhow::Result<MemoryBlock>;
 
     /// Attach all previously allocated memory blocks.

vm/devices/user_driver/src/memory.rs

Lines changed: 14 additions & 0 deletions
@@ -156,6 +156,20 @@ impl MemoryBlock {
         self.mem.pfn_bias()
     }
 
+    /// Returns true if the PFNs are contiguous.
+    ///
+    /// TODO: Fallback allocations are here for now, but we should eventually
+    /// allow the caller to require these. See `DmaClient::allocate_dma_buffer`.
+    pub fn contiguous_pfns(&self) -> bool {
+        for (curr, next) in self.pfns().iter().zip(self.pfns().iter().skip(1)) {
+            if *curr + 1 != *next {
+                return false;
+            }
+        }
+
+        true
+    }
+
     /// Gets the buffer as an atomic slice.
     pub fn as_slice(&self) -> &[AtomicU8] {
         // SAFETY: the underlying memory is valid for the lifetime of `mem`.
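contiguous_pfns is a pairwise scan over adjacent page frame numbers. A quick standalone check of the same logic on sample PFN lists (illustrative helper, equivalent behavior):

// Sketch: same adjacency logic as MemoryBlock::contiguous_pfns.
fn contiguous(pfns: &[u64]) -> bool {
    pfns.windows(2).all(|w| w[0] + 1 == w[1])
}

fn main() {
    assert!(contiguous(&[10, 11, 12, 13])); // physically contiguous pages
    assert!(!contiguous(&[10, 11, 20, 21])); // gap between PFNs 11 and 20
    assert!(contiguous(&[42])); // a single page is trivially contiguous
}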
