
Commit 21923bf

nvme_driver: support multi-page allocations for queues (#2061)
The vast majority of this work was done by @chris-oo, many thanks to him!

The outstanding queue depth of the nvme driver can be limited by the number of entries in the IO submission queue. Prior to this change, OpenHCL's nvme driver supported one page worth of submission queue entries; after it, the driver supports as many queue entries as the device does, up to the new four-page maximum.

Contiguous allocations are likely, but not strictly guaranteed. This code opts to preserve forward progress by falling back to single-page queues when contiguous memory cannot be obtained. This is testable (and has been validated manually and with some of the existing CI tests) by not configuring a private pool. There are some other design tradeoffs, noted as comments in the change.
1 parent c6493ea commit 21923bf
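For context on the sizing math: an NVMe submission-queue entry is 64 bytes and a completion-queue entry is 16 bytes, so with 4 KiB pages the old one-page SQ topped out at 64 entries, while the CQ's single page already held 256. A minimal sketch of that arithmetic (assuming 4 KiB pages; the real constants live in queue_pair.rs below):

// Sketch only: the queue-depth arithmetic behind this change, assuming 4 KiB pages.
const PAGE_SIZE: usize = 4096;
const SQ_ENTRY_SIZE: usize = 64; // size_of::<spec::Command>()
const CQ_ENTRY_SIZE: usize = 16; // size_of::<spec::Completion>()

fn main() {
    let old_max_sq = PAGE_SIZE / SQ_ENTRY_SIZE; // 1 page of SQ entries
    let new_max_sq = (PAGE_SIZE * 4) / SQ_ENTRY_SIZE; // 4 pages of SQ entries
    let max_cq = PAGE_SIZE / CQ_ENTRY_SIZE; // CQ stays at 1 page
    assert_eq!((old_max_sq, new_max_sq, max_cq), (64, 256, 256));
}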

File tree

11 files changed: +164 −20


Cargo.lock

Lines changed: 2 additions & 0 deletions
@@ -4819,6 +4819,7 @@ dependencies = [
  "inspect",
  "mesh",
  "open_enum",
+ "static_assertions",
  "storage_string",
  "zerocopy 0.8.25",
 ]
@@ -4998,6 +4999,7 @@ dependencies = [
  "memory_range",
  "mesh",
  "page_pool_alloc",
+ "tracing",
  "user_driver",
  "virt",
  "vmcore",

openhcl/openhcl_dma_manager/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ virt.workspace = true
 vmcore.workspace = true
 
 anyhow.workspace = true
+tracing.workspace = true
 
 [lints]
 workspace = true

openhcl/openhcl_dma_manager/src/lib.rs

Lines changed: 31 additions & 3 deletions
@@ -286,12 +286,20 @@ impl DmaManagerInner {
             allocation_visibility: AllocationVisibility::Private,
             persistent_allocations: false,
             shared_spawner: _,
-            private_spawner: _,
+            private_spawner,
         } => match lower_vtl_policy {
             LowerVtlPermissionPolicy::Any => {
                 // No persistence needed means the `LockedMemorySpawner`
                 // using normal VTL2 ram is fine.
-                DmaClientBacking::LockedMemory(LockedMemorySpawner)
+                match private_spawner {
+                    Some(private) => DmaClientBacking::PrivatePoolWithFallback((
+                        private
+                            .allocator(device_name.into())
+                            .context("failed to create private allocator")?,
+                        LockedMemorySpawner,
+                    )),
+                    None => DmaClientBacking::LockedMemory(LockedMemorySpawner),
+                }
             }
             LowerVtlPermissionPolicy::Vtl0 => {
                 // `LockedMemorySpawner` uses private VTL2 ram, so
@@ -416,6 +424,7 @@ enum DmaClientBacking {
     SharedPool(#[inspect(skip)] PagePoolAllocator),
     PrivatePool(#[inspect(skip)] PagePoolAllocator),
     LockedMemory(#[inspect(skip)] LockedMemorySpawner),
+    PrivatePoolWithFallback(#[inspect(skip)] (PagePoolAllocator, LockedMemorySpawner)),
     PrivatePoolLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<PagePoolAllocator>),
     LockedMemoryLowerVtl(#[inspect(skip)] LowerVtlMemorySpawner<LockedMemorySpawner>),
 }
@@ -429,6 +438,16 @@ impl DmaClientBacking {
             DmaClientBacking::SharedPool(allocator) => allocator.allocate_dma_buffer(total_size),
             DmaClientBacking::PrivatePool(allocator) => allocator.allocate_dma_buffer(total_size),
             DmaClientBacking::LockedMemory(spawner) => spawner.allocate_dma_buffer(total_size),
+            DmaClientBacking::PrivatePoolWithFallback((allocator, spawner)) => {
+                allocator.allocate_dma_buffer(total_size).or_else(|err| {
+                    tracing::warn!(
+                        size = total_size,
+                        error = ?err,
+                        "falling back to locked memory for dma allocation"
+                    );
+                    spawner.allocate_dma_buffer(total_size)
+                })
+            }
             DmaClientBacking::PrivatePoolLowerVtl(spawner) => {
                 spawner.allocate_dma_buffer(total_size)
             }
@@ -442,7 +461,16 @@
         match self {
             DmaClientBacking::SharedPool(allocator) => allocator.attach_pending_buffers(),
             DmaClientBacking::PrivatePool(allocator) => allocator.attach_pending_buffers(),
-            DmaClientBacking::LockedMemory(spawner) => spawner.attach_pending_buffers(),
+            DmaClientBacking::PrivatePoolWithFallback(_) => {
+                anyhow::bail!("cannot attach pending buffers with fallback allocator")
+            }
+            DmaClientBacking::LockedMemory(_) => {
+                anyhow::bail!(
+                    "attaching pending buffers is not supported with locked memory; \
+                     this client type does not maintain a pool of pending allocations. \
+                     To use attach_pending_buffers, create a client backed by a shared or private pool."
+                )
+            }
             DmaClientBacking::PrivatePoolLowerVtl(spawner) => spawner.attach_pending_buffers(),
             DmaClientBacking::LockedMemoryLowerVtl(spawner) => spawner.attach_pending_buffers(),
         }

vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs

Lines changed: 14 additions & 1 deletion
@@ -325,6 +325,9 @@ impl<T: DeviceBacking> NvmeDriver<T> {
         )
         .context("failed to create admin queue pair")?;
 
+        let admin_sqes = admin.sq_entries();
+        let admin_cqes = admin.cq_entries();
+
         let admin = worker.admin.insert(admin);
 
         // Register the admin queue with the controller.
@@ -452,6 +455,13 @@ impl<T: DeviceBacking> NvmeDriver<T> {
             let io_cqsize = (MAX_CQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;
             let io_sqsize = (MAX_SQ_ENTRIES - 1).min(worker.registers.cap.mqes_z()) + 1;
 
+            tracing::debug!(
+                io_cqsize,
+                io_sqsize,
+                hw_size = worker.registers.cap.mqes_z(),
+                "io queue sizes"
+            );
+
             // Some hardware (such as ASAP) require that the sq and cq have the same size.
             io_cqsize.min(io_sqsize)
         };
@@ -630,7 +640,7 @@ impl<T: DeviceBacking> NvmeDriver<T> {
             admin: None, // Updated below.
             identify: Some(Arc::new(
                 spec::IdentifyController::read_from_bytes(saved_state.identify_ctrl.as_bytes())
-                    .map_err(|_| RestoreError::InvalidData)?, // TODO: zerocopy: map_err (https://github.com/microsoft/openvmm/issues/759)
+                    .map_err(|_| RestoreError::InvalidData)?,
             )),
             driver: driver.clone(),
             io_issuers,
@@ -955,6 +965,9 @@ impl<T: DeviceBacking> DriverWorkerTask<T> {
             )
             .map_err(|err| DeviceError::IoQueuePairCreationFailure(err, qid))?;
 
+        assert_eq!(queue.sq_entries(), queue.cq_entries());
+        state.qsize = queue.sq_entries();
+
         let io_sq_addr = queue.sq_addr();
         let io_cq_addr = queue.cq_addr();
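One subtlety in the sizing expression above: CAP.MQES is a zero-based count in the NVMe spec (hence the mqes_z() accessor), so the clamp subtracts one from the driver maximum before taking the min, then adds it back. A worked sketch with example values:

// Sketch: clamping a requested queue size against CAP.MQES (a zero-based value).
fn clamp_to_device(driver_max: u16, mqes_zero_based: u16) -> u16 {
    (driver_max - 1).min(mqes_zero_based) + 1
}

fn main() {
    assert_eq!(clamp_to_device(256, 1023), 256); // device allows 1024 entries
    assert_eq!(clamp_to_device(256, 15), 16); // device allows only 16 entries
}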

vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs

Lines changed: 91 additions & 12 deletions
@@ -46,18 +46,23 @@ use zerocopy::FromZeros;
 
 /// Value for unused PRP entries, to catch/mitigate buffer size mismatches.
 const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1);
-/// Maximum SQ size in entries.
-pub const MAX_SQ_ENTRIES: u16 = (PAGE_SIZE / 64) as u16;
-/// Maximum CQ size in entries.
-pub const MAX_CQ_ENTRIES: u16 = (PAGE_SIZE / 16) as u16;
+
+const SQ_ENTRY_SIZE: usize = size_of::<spec::Command>();
+const CQ_ENTRY_SIZE: usize = size_of::<spec::Completion>();
 /// Submission Queue size in bytes.
-const SQ_SIZE: usize = PAGE_SIZE;
+const SQ_SIZE: usize = PAGE_SIZE * 4;
 /// Completion Queue size in bytes.
 const CQ_SIZE: usize = PAGE_SIZE;
+/// Maximum SQ size in entries.
+pub const MAX_SQ_ENTRIES: u16 = (SQ_SIZE / SQ_ENTRY_SIZE) as u16;
+/// Maximum CQ size in entries.
+pub const MAX_CQ_ENTRIES: u16 = (CQ_SIZE / CQ_ENTRY_SIZE) as u16;
 /// Number of pages per queue if bounce buffering.
 const PER_QUEUE_PAGES_BOUNCE_BUFFER: usize = 128;
 /// Number of pages per queue if not bounce buffering.
 const PER_QUEUE_PAGES_NO_BOUNCE_BUFFER: usize = 64;
+/// Number of SQ entries per page (64).
+const SQ_ENTRIES_PER_PAGE: usize = PAGE_SIZE / SQ_ENTRY_SIZE;
 
 #[derive(Inspect)]
 pub(crate) struct QueuePair<T: AerHandler> {
@@ -75,6 +80,8 @@ pub(crate) struct QueuePair<T: AerHandler> {
     sq_entries: u16,
     #[inspect(skip)]
     cq_entries: u16,
+    sq_addr: u64,
+    cq_addr: u64,
 }
 
 impl PendingCommands {
@@ -177,17 +184,31 @@
 }
 
 impl<T: AerHandler> QueuePair<T> {
+    /// Create a new queue pair.
+    ///
+    /// `sq_entries` and `cq_entries` are the requested sizes in entries.
+    /// Calling code should request the largest size it thinks the device
+    /// will support (see `CAP.MQES`). These may be clamped down to what will
+    /// fit in one page should this routine fail to allocate physically
+    /// contiguous memory to back the queues.
+    /// IMPORTANT: Calling code should check the actual sizes via corresponding
+    /// calls to [`QueuePair::sq_entries`] and [`QueuePair::cq_entries`] AFTER calling this routine.
     pub fn new(
         spawner: impl SpawnDriver,
         device: &impl DeviceBacking,
         qid: u16,
-        sq_entries: u16, // Requested SQ size in entries.
-        cq_entries: u16, // Requested CQ size in entries.
+        sq_entries: u16,
+        cq_entries: u16,
         interrupt: DeviceInterrupt,
         registers: Arc<DeviceRegisters<impl DeviceBacking>>,
         bounce_buffer: bool,
         aer_handler: T,
     ) -> anyhow::Result<Self> {
+        // FUTURE: Consider splitting this into several allocations, rather than
+        // allocating the sum total together. This can increase the likelihood
+        // of getting contiguous memory when falling back to the LockedMem
+        // allocator, but this is not the expected path. Be careful that any
+        // changes you make here work with already established save state.
         let total_size = SQ_SIZE
             + CQ_SIZE
             + if bounce_buffer {
@@ -196,6 +217,10 @@ impl<T: AerHandler> QueuePair<T> {
                 PER_QUEUE_PAGES_NO_BOUNCE_BUFFER * PAGE_SIZE
             };
         let dma_client = device.dma_client();
+
+        // TODO: Keepalive: Detect when the allocation came from outside
+        // the private pool and put the device in a degraded state, so it
+        // is possible to inspect that a servicing with keepalive will fail.
         let mem = dma_client
             .allocate_dma_buffer(total_size)
             .context("failed to allocate memory for queues")?;
@@ -217,12 +242,11 @@
         )
     }
 
-    /// Create new object or restore from saved state.
     fn new_or_restore(
         spawner: impl SpawnDriver,
         qid: u16,
-        sq_entries: u16, // Submission queue entries.
-        cq_entries: u16, // Completion queue entries.
+        sq_entries: u16,
+        cq_entries: u16,
         mut interrupt: DeviceInterrupt,
         registers: Arc<DeviceRegisters<impl DeviceBacking>>,
         mem: MemoryBlock,
@@ -235,6 +259,49 @@ impl<T: AerHandler> QueuePair<T> {
         let cq_mem_block = mem.subblock(SQ_SIZE, CQ_SIZE);
         let data_offset = SQ_SIZE + CQ_SIZE;
 
+        // Make sure that the queue memory is physically contiguous. While the
+        // NVMe spec allows for some provisions of queue memory to be
+        // non-contiguous, this depends on device support. At least one device
+        // that we must support requires that the memory is contiguous (via the
+        // CAP.CQR bit). Because of that, just simplify the code paths to use
+        // contiguous memory.
+        //
+        // We could also seek through the memory block to find contiguous pages
+        // (for example, if the first 4 pages are not contiguous, but pages 5-8
+        // are, use those), but other parts of this driver already assume the
+        // math to get the correct offsets.
+        //
+        // N.B. It is expected that allocations from the private pool will
+        // always be contiguous, and that is the normal path. That can fail in
+        // some cases (e.g. if we got some guesses about memory size wrong), and
+        // we prefer to operate in a perf degraded state rather than fail
+        // completely.
+
+        let (sq_is_contiguous, cq_is_contiguous) = (
+            sq_mem_block.contiguous_pfns(),
+            cq_mem_block.contiguous_pfns(),
+        );
+
+        let (sq_entries, cq_entries) = if !sq_is_contiguous || !cq_is_contiguous {
+            tracing::warn!(
+                qid,
+                sq_is_contiguous,
+                sq_mem_block.pfns = ?sq_mem_block.pfns(),
+                cq_is_contiguous,
+                cq_mem_block.pfns = ?cq_mem_block.pfns(),
+                "non-contiguous queue memory detected, falling back to single page queues"
+            );
+            // Clamp both queues to the number of entries that will fit in a
+            // single SQ page (since this will be the smaller between the SQ and
+            // CQ capacity).
+            (SQ_ENTRIES_PER_PAGE as u16, SQ_ENTRIES_PER_PAGE as u16)
+        } else {
+            (sq_entries, cq_entries)
+        };
+
+        let sq_addr = sq_mem_block.pfns()[0] * PAGE_SIZE64;
+        let cq_addr = cq_mem_block.pfns()[0] * PAGE_SIZE64;
+
         let mut queue_handler = match saved_state {
             Some(s) => QueueHandler::restore(sq_mem_block, cq_mem_block, s, aer_handler)?,
             None => {
@@ -296,15 +363,27 @@ impl<T: AerHandler> QueuePair<T> {
             qid,
             sq_entries,
             cq_entries,
+            sq_addr,
+            cq_addr,
         })
     }
 
+    /// Returns the actual number of SQ entries supported by this queue pair.
+    pub fn sq_entries(&self) -> u16 {
+        self.sq_entries
+    }
+
+    /// Returns the actual number of CQ entries supported by this queue pair.
+    pub fn cq_entries(&self) -> u16 {
+        self.cq_entries
+    }
+
     pub fn sq_addr(&self) -> u64 {
-        self.mem.pfns()[0] * PAGE_SIZE64
+        self.sq_addr
     }
 
     pub fn cq_addr(&self) -> u64 {
-        self.mem.pfns()[1] * PAGE_SIZE64
+        self.cq_addr
     }
 
     pub fn issuer(&self) -> &Arc<Issuer> {
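The contiguity check in new_or_restore gates the clamp: if either queue's backing pages are not physically contiguous, both queues fall back to a single page's worth of SQ entries (the smaller of the two queues' one-page capacities). A condensed sketch of that decision, using the constants from this file:

// Sketch of the clamp decision above, in isolation.
const SQ_ENTRIES_PER_PAGE: u16 = 64; // PAGE_SIZE / size_of::<spec::Command>()

fn effective_entries(
    sq_contiguous: bool,
    cq_contiguous: bool,
    requested: (u16, u16),
) -> (u16, u16) {
    if !sq_contiguous || !cq_contiguous {
        // Degrade to single-page queues rather than fail outright.
        (SQ_ENTRIES_PER_PAGE, SQ_ENTRIES_PER_PAGE)
    } else {
        requested
    }
}

fn main() {
    assert_eq!(effective_entries(true, true, (256, 256)), (256, 256));
    assert_eq!(effective_entries(false, true, (256, 256)), (64, 64));
}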

vm/devices/storage/disk_nvme/nvme_driver/src/queues.rs

Lines changed: 4 additions & 2 deletions
@@ -31,6 +31,8 @@ pub(crate) struct QueueFull;
 
 impl SubmissionQueue {
     pub fn new(sqid: u16, len: u16, mem: MemoryBlock) -> Self {
+        tracing::debug!(sqid, len, pfns = ?mem.pfns(), "new submission queue");
+
         Self {
             sqid,
             head: 0,
@@ -122,6 +124,7 @@ pub(crate) struct CompletionQueue {
 
 impl CompletionQueue {
     pub fn new(cqid: u16, len: u16, mem: MemoryBlock) -> CompletionQueue {
+        tracing::debug!(cqid, len, pfns = ?mem.pfns(), "new completion queue");
         Self {
             cqid,
             head: 0,
@@ -138,8 +141,7 @@ impl CompletionQueue {
 
     pub fn read(&mut self) -> Option<spec::Completion> {
         let completion_mem = self.mem.as_slice()
-            [self.head as usize * size_of::<spec::Completion>()..]
-            [..size_of::<spec::Completion>() * 2]
+            [self.head as usize * size_of::<spec::Completion>()..][..size_of::<spec::Completion>()]
             .as_atomic_slice::<AtomicU64>()
             .unwrap();
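The read() change above is also a small bounds fix: the old slice took two completion entries' worth of bytes at the head, which at the last entry would reach past the one-entry window the code actually needs. The [start..][..len] double index is the standard idiom for "len bytes at offset start". A quick illustration:

// Sketch: the `[start..][..len]` indexing idiom used in read().
fn entry_bytes(buf: &[u8], index: usize, entry_size: usize) -> &[u8] {
    &buf[index * entry_size..][..entry_size]
}

fn main() {
    let ring = [0u8; 64]; // e.g. four 16-byte completion entries
    assert_eq!(entry_bytes(&ring, 3, 16).len(), 16); // last entry stays in bounds
}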

vm/devices/storage/nvme_spec/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ mesh.workspace = true
 bitfield-struct.workspace = true
 open_enum.workspace = true
 zerocopy.workspace = true
+static_assertions.workspace = true
 
 [lints]
 workspace = true

vm/devices/storage/nvme_spec/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -157,6 +157,7 @@ pub struct Command {
     pub cdw14: u32,
     pub cdw15: u32,
 }
+static_assertions::assert_eq_size!(Command, [u8; 64]);
 
 #[derive(Inspect)]
 #[bitfield(u32)]
@@ -230,6 +231,7 @@ pub struct Completion {
     pub cid: u16,
     pub status: CompletionStatus,
 }
+static_assertions::assert_eq_size!(Completion, [u8; 16]);
 
 #[bitfield(u16)]
 #[derive(IntoBytes, Immutable, KnownLayout, FromBytes, MeshPayload)]

vm/devices/user_driver/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ pub trait DmaClient: Send + Sync + Inspect {
     /// Allocate a new DMA buffer. This buffer must be zero initialized.
     ///
     /// TODO: string tag for allocation?
+    /// TODO: contiguous vs non-contiguous? (both on the request side, and if
+    /// a requested contiguous allocation cannot be fulfilled)
     fn allocate_dma_buffer(&self, total_size: usize) -> anyhow::Result<MemoryBlock>;
 
     /// Attach all previously allocated memory blocks.

vm/devices/user_driver/src/memory.rs

Lines changed: 14 additions & 0 deletions
@@ -156,6 +156,20 @@ impl MemoryBlock {
         self.mem.pfn_bias()
     }
 
+    /// Returns true if the PFNs are contiguous.
+    ///
+    /// TODO: Fallback allocations are here for now, but we should eventually
+    /// allow the caller to require these. See `DmaClient::allocate_dma_buffer`.
+    pub fn contiguous_pfns(&self) -> bool {
+        for (curr, next) in self.pfns().iter().zip(self.pfns().iter().skip(1)) {
+            if *curr + 1 != *next {
+                return false;
+            }
+        }
+
+        true
+    }
+
     /// Gets the buffer as an atomic slice.
     pub fn as_slice(&self) -> &[AtomicU8] {
         // SAFETY: the underlying memory is valid for the lifetime of `mem`.
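contiguous_pfns is a pairwise scan over adjacent page frame numbers. A quick standalone check of the same logic on sample PFN lists (illustrative helper, equivalent behavior):

// Sketch: same adjacency logic as MemoryBlock::contiguous_pfns.
fn contiguous(pfns: &[u64]) -> bool {
    pfns.windows(2).all(|w| w[0] + 1 == w[1])
}

fn main() {
    assert!(contiguous(&[10, 11, 12, 13])); // physically contiguous pages
    assert!(!contiguous(&[10, 11, 20, 21])); // gap between PFNs 11 and 20
    assert!(contiguous(&[42])); // a single page is trivially contiguous
}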
