diff --git a/Cargo.lock b/Cargo.lock index fcb9e381fd6..2498d6e843f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3569,6 +3569,7 @@ dependencies = [ "serde", "serde_json", "slog", + "thiserror 2.0.12", "uuid", ] diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index d50d2a2be21..7633fa95e38 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -22,5 +22,6 @@ serde.workspace = true serde_json.workspace = true schemars.workspace = true slog.workspace = true +thiserror.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index c980ff0dfe6..c4b16790362 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -63,7 +63,7 @@ progenitor::generate_api!( HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] }, ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] }, - RotImageError = { derives = [ PartialEq, Eq, PartialOrd, Ord] }, + RotImageError = { derives = [ thiserror::Error, PartialEq, Eq, PartialOrd, Ord] }, RotState = { derives = [PartialEq, Eq, PartialOrd, Ord] }, SpComponentCaboose = { derives = [PartialEq, Eq] }, SpIdentifier = { derives = [Copy, PartialEq, Hash, Eq] }, diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml index 56c3cc15622..9c47c14cf47 100644 --- a/nexus/mgs-updates/Cargo.toml +++ b/nexus/mgs-updates/Cargo.toml @@ -11,6 +11,7 @@ chrono.workspace = true futures.workspace = true gateway-client.workspace = true gateway-types.workspace = true +gateway-messages.workspace = true iddqd.workspace = true id-map.workspace = true internal-dns-resolver.workspace = true diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 7d74496d163..f4315c41a0c 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -267,7 +267,7 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>>; + ) -> BoxFuture<'a, Result<(), PostUpdateError>>; } /// Describes the live state of the component before the update begins @@ -319,12 +319,66 @@ pub enum PrecheckError { WrongInactiveVersion { expected: ExpectedVersion, found: FoundVersion }, } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] +pub enum PostUpdateError { + #[error("communicating with MGS")] + GatewayClientError(#[from] GatewayClientError), + + #[error("transient error: {message:?}")] + TransientError { message: String }, + + #[error("fatal error: {error:?}")] + FatalError { error: String }, +} + +impl PostUpdateError { + pub fn is_fatal(&self) -> bool { + match self { + PostUpdateError::GatewayClientError(error) => { + !matches!(error, gateway_client::Error::CommunicationError(_)) + } + PostUpdateError::TransientError { .. } => false, + PostUpdateError::FatalError { .. } => true, + } + } +} + +#[derive(Debug, Clone)] pub enum FoundVersion { MissingVersion, Version(String), } +impl FoundVersion { + pub fn matches( + &self, + expected: &ExpectedVersion, + ) -> Result<(), PrecheckError> { + match (expected, &self) { + // expected garbage, found garbage + (ExpectedVersion::NoValidVersion, FoundVersion::MissingVersion) => { + () + } + // expected a specific version and found it + ( + ExpectedVersion::Version(artifact_version), + FoundVersion::Version(found_version), + ) if artifact_version.to_string() == *found_version => (), + // anything else is a mismatch + (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) + | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) + | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { + return Err(PrecheckError::WrongInactiveVersion { + expected: expected.clone(), + found: self.clone(), + }); + } + }; + + Ok(()) + } +} + pub(crate) fn error_means_caboose_is_invalid( error: &GatewayClientError, ) -> bool { diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 05099532b06..5b55b970a4b 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -32,7 +32,37 @@ use uuid::Uuid; /// How long may the status remain unchanged without us treating this as a /// problem? -pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(120); +// +// Generally, this value covers two different things: +// +// 1. While we're uploading an image to the SP or it's being prepared, how long +// can the status stay the same before we give up altogether and try again? +// In practice, this would rarely pause for more than a few seconds. +// 2. The period where we might wait for an update to complete -- either our own +// update (in which case this is the period after the final device reset +// until the device comes up reporting the new version) or another instance's +// update (in which case this could cover almost the _entire_ update +// process). +// +// In both cases, if the timeout is reached, the whole update attempt will fail. +// This behavior is only intended to deal with pathological cases, like an MGS +// crash (which could cause an upload to hang indefinitely) or a Nexus crash +// (which could cause any update to hang indefinitely at any point). So we can +// afford to be generous here. Further, we really don't want to trip this +// erroneously in a working system because we're likely to get stuck continuing +// to retry and give up before each attempt finishes. +// +// In terms of sizing this timeout: +// - For all updates, the upload phase generally takes 10-20 seconds. +// - For SP updates, the post-reset phase can take about 30s (with Sidecar SPs +// being the longest). +// - For RoT and RoT bootloader updates, two resets and an intervening "set +// active slot" operation are required. Together, these could take just a +// few seconds. +// +// Adding all the above together, and giving ourselves plenty of margin, we +// choose 10 minutes. +pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(600); /// How long to wait between failed attempts to reset the device const RESET_DELAY_INTERVAL: Duration = Duration::from_secs(10); @@ -214,7 +244,7 @@ pub(crate) async fn apply_update( debug!(log, "loaded artifact contents"); // Check the live state first to see if: - // - this update has already been completed, or + // - this update has already been completed, // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); match update_helper.precheck(log, &mut mgs_clients, update).await { @@ -226,7 +256,6 @@ pub(crate) async fn apply_update( return Err(ApplyUpdateError::PreconditionFailed(error)); } }; - // Start the update. debug!(log, "ready to start update"); status.update(UpdateAttemptStatus::Updating); @@ -354,13 +383,13 @@ pub(crate) async fn apply_update( if try_reset { // We retry this until we get some error *other* than a communication - // error. There is intentionally no timeout here. If we've staged an - // update but not managed to reset the device, there's no point where - // we'd want to stop trying to do so. + // error or some other transient error. There is intentionally no + // timeout here. If we've staged an update but not managed to reset + // the device, there's no point where we'd want to stop trying to do so. while let Err(error) = update_helper.post_update(log, &mut mgs_clients, update).await { - if !matches!(error, gateway_client::Error::CommunicationError(_)) { + if error.is_fatal() { let error = InlineErrorChain::new(&error); error!(log, "post_update failed"; &error); return Err(ApplyUpdateError::SpResetFailed(error.to_string())); @@ -735,6 +764,91 @@ mod test { finished.expect_sp_success(expected_result); } + /// Tests several happy-path cases of updating an RoT bootloader + #[tokio::test] + async fn test_rot_bootloader_update_basic() { + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_rot_bootloader_update_basic", + SpPort::One, + ) + .await; + let log = &gwtestctx.logctx.log; + let artifacts = TestArtifacts::new(log).await.unwrap(); + + // Basic case: normal update + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Sled, + 1, + &artifacts.rot_bootloader_gimlet_artifact_hash, + UpdateCompletedHow::CompletedUpdate, + ) + .await; + + // Basic case: attempted update, found no changes needed + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Sled, + 1, + &artifacts.rot_bootloader_gimlet_artifact_hash, + UpdateCompletedHow::FoundNoChangesNeeded, + ) + .await; + + // Run the same two tests for a switch RoT bootloader. + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Switch, + 0, + &artifacts.rot_bootloader_sidecar_artifact_hash, + UpdateCompletedHow::CompletedUpdate, + ) + .await; + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Switch, + 0, + &artifacts.rot_bootloader_sidecar_artifact_hash, + UpdateCompletedHow::FoundNoChangesNeeded, + ) + .await; + + artifacts.teardown().await; + gwtestctx.teardown().await; + } + + async fn run_one_successful_rot_bootloader_update( + gwtestctx: &GatewayTestContext, + artifacts: &TestArtifacts, + sp_type: SpType, + slot_id: u16, + artifact_hash: &ArtifactHash, + expected_result: UpdateCompletedHow, + ) { + let desc = UpdateDescription { + gwtestctx, + artifacts, + sp_type, + slot_id, + artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + let in_progress = desc.setup().await; + let finished = in_progress.finish().await; + finished.expect_rot_bootloader_success(expected_result); + } + /// Tests several happy-path cases of updating an RoT #[tokio::test] async fn test_rot_update_basic() { @@ -1371,4 +1485,174 @@ mod test { gwtestctx.teardown().await; } + + /// Tests a bunch of easy fast-failure RoT bootloader cases. + #[tokio::test] + async fn test_rot_bootloader_basic_failures() { + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_rot_bootloader_basic_failures", + SpPort::One, + ) + .await; + let log = &gwtestctx.logctx.log; + let artifacts = TestArtifacts::new(log).await.unwrap(); + + // Test a case of mistaken identity (reported baseboard does not match + // the one that we expect). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: Some(BaseboardId { + part_number: String::from("i86pc"), + serial_number: String::from("SimGimlet0"), + }), + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "in sled slot 1, expected to find part \"i86pc\" serial \ + \"SimGimlet0\", but found part \"i86pc\" serial \ + \"SimGimlet01\"", + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where the active version doesn't match what we expect. + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: Some( + "not-right".parse().unwrap(), + ), + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find active version \"not-right\", but \ + found \"0.0.200\"" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where the inactive version doesn't match what it should + // (expected invalid, found something else). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: Some( + ExpectedVersion::NoValidVersion, + ), + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find inactive version NoValidVersion, \ + but found Version(\"0.0.200\")" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Now test a case where the inactive version doesn't match what it + // should (expected a different valid version). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: Some( + ExpectedVersion::Version( + "something-else".parse().unwrap(), + ), + ), + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find inactive version \ + Version(ArtifactVersion(\"something-else\")), but found \ + Version(\"0.0.200\")" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where we fail to fetch the artifact. We simulate this by + // tearing down our artifact server before the update starts. + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + let in_progress = desc.setup().await; + artifacts.teardown().await; + in_progress.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::FetchArtifact(..)); + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + gwtestctx.teardown().await; + } } diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 04aa569b0df..bf3a9a61eb4 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -6,35 +6,353 @@ use super::MgsClients; use crate::SpComponentUpdateHelper; +use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; +use crate::common_sp_update::error_means_caboose_is_invalid; +use futures::FutureExt; use futures::future::BoxFuture; +use gateway_client::SpComponent; +use gateway_client::types::GetRotBootInfoParams; +use gateway_client::types::RotImageError; +use gateway_client::types::RotState; +use gateway_client::types::SpComponentFirmwareSlot; +use gateway_client::types::SpType; +use gateway_messages::RotBootInfo; use nexus_types::deployment::PendingMgsUpdate; +use nexus_types::deployment::PendingMgsUpdateDetails; +use slog::Logger; +use slog::{debug, error, info}; +use slog_error_chain::InlineErrorChain; +use std::time::Duration; +use std::time::Instant; -type GatewayClientError = gateway_client::Error; +const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(120); + +const WAIT_FOR_BOOT_INFO_INTERVAL: Duration = Duration::from_secs(10); pub struct ReconfiguratorRotBootloaderUpdater; impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { /// Checks if the component is already updated or ready for update fn precheck<'a>( &'a self, - _log: &'a slog::Logger, - _mgs_clients: &'a mut MgsClients, - _update: &'a PendingMgsUpdate, + log: &'a slog::Logger, + mgs_clients: &'a mut MgsClients, + update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result> { - // TODO-K: To be completed in a follow up PR - todo!() + async move { + // Verify that the device is the one we think it is. + let state = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client.sp_get(update.sp_type, update.slot_id).await + }) + .await? + .into_inner(); + debug!(log, "found SP state"; "state" => ?state); + if state.model != update.baseboard_id.part_number + || state.serial_number != update.baseboard_id.serial_number + { + return Err(PrecheckError::WrongDevice { + sp_type: update.sp_type, + slot_id: update.slot_id, + expected_part: update.baseboard_id.part_number.clone(), + expected_serial: update.baseboard_id.serial_number.clone(), + found_part: state.model, + found_serial: state.serial_number, + }); + } + + // Fetch the caboose from the currently active slot (stage0). + let caboose = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_caboose_get( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + 0, + ) + .await + }) + .await? + .into_inner(); + debug!(log, "found active slot caboose"; "caboose" => ?caboose); + + let found_stage0_version = caboose.version; + + // If the version in the currently active slot matches the one we're + // trying to set, then there's nothing to do. + if found_stage0_version == update.artifact_version.as_str() { + return Ok(PrecheckStatus::UpdateComplete); + } + + // Otherwise, if the version in the currently active slot does not + // match what we expect to find, bail out. It may be that somebody + // else has come along and completed a subsequent update and we + // don't want to roll that back. (If for some reason we *do* want + // to do this update, the planner will have to notice that what's + // here is wrong and update the blueprint.) + let PendingMgsUpdateDetails::RotBootloader { + expected_stage0_version, + expected_stage0_next_version, + } = &update.details + else { + unreachable!( + "pending MGS update details within \ + ReconfiguratorRotBootloaderUpdater will always be for the \ + RoT bootloader" + ); + }; + if found_stage0_version != expected_stage0_version.to_string() { + return Err(PrecheckError::WrongActiveVersion { + expected: expected_stage0_version.clone(), + found: found_stage0_version, + }); + } + + // For the same reason, check that the version in the inactive slot + // matches what we expect to find. + // TODO It's important for us to detect the condition that a caboose + // is invalid because this can happen when devices are programmed + // with a bad image. Unfortunately, MGS currently reports this as a + // 503. Besides being annoying for us to look for, this causes + // `try_all_serially()` to try the other MGS. That's pointless + // here, but not a big deal. + let found_stage0_next_caboose_result = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_caboose_get( + update.sp_type, + update.slot_id, + // The naming here is a bit confusing because "stage0" + // sometimes refers to the component (RoT bootloader) + // and sometimes refers to the active slot for that + // component. Here, we're accessing the inactive slot + // for it. The component is still "stage0". + &SpComponent::STAGE0.to_string(), + 1, + ) + .await + }) + .await; + let found_stage0_next_version = + match found_stage0_next_caboose_result { + Ok(version) => { + FoundVersion::Version(version.into_inner().version) + } + Err(error) => { + if error_means_caboose_is_invalid(&error) { + FoundVersion::MissingVersion + } else { + return Err(PrecheckError::from(error)); + } + } + }; + found_stage0_next_version + .clone() + .matches(&expected_stage0_next_version)?; + + Ok(PrecheckStatus::ReadyForUpdate) + } + .boxed() } /// Attempts once to perform any post-update actions (e.g., reset the /// device) fn post_update<'a>( &'a self, - _log: &'a slog::Logger, - _mgs_clients: &'a mut MgsClients, - _update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - // TODO-K: To be completed in a follow up PR - todo!() + log: &'a slog::Logger, + mgs_clients: &'a mut MgsClients, + update: &'a PendingMgsUpdate, + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + // To protect against bricking itself, the device will only activate + // a new image after it's been verified. Images are only verified at + // device boot time. Thus, we'll reset the device once to cause the + // signature to be verified. Then we can activate the new image and + // reset the device again. + debug!( + log, + "attempting to reset device to do bootloader signature check" + ); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await + }) + .await?; + + // We now retrieve boot info from the RoT to verify the reset + // has completed and signature checks done. + debug!( + log, + "attempting to retrieve boot info to verify image validity" + ); + let stage0next_error = wait_for_stage0_next_image_check( + log, + mgs_clients, + update.sp_type, + update.slot_id, + WAIT_FOR_BOOT_INFO_TIMEOUT, + ) + .await?; + // If boot info contains any error with the image loaded onto + // stage0_next, the device won't let us load this image onto stage0. + // We return a fatal error. + if let Some(e) = stage0next_error { + return Err(PostUpdateError::FatalError { + error: InlineErrorChain::new(&e).to_string(), + }); + } + + // This operation is very delicate. Here, we're overwriting the + // device bootloader with the one that we've written to the + // stage0next slot. + // The hardware has no fallback slot for the bootloader. So if the + // device resets or loses power while we're copying stage0next to + // the stage0 slot, it could still become bricked. + // + // We've already done everything we can to mitigate this: + // + // - The data is already on the device, minimizing the time to copy + // it to where it needs to go. + // - The image has already been verified by the device (and the + // device validates _that_ before starting this operation), so it + // won't fail at boot for that reason. + // - The device can't be externally reset _during_ this operation + // because the same code responsible for processing the reset + // request will be busy doing the copy. + // - We only ever update one RoT stage0 at a time in a rack, so if + // we brick one, only one sled would be affected (still bad). + // + // So we're ready to roll! + debug!(log, "attempting to set RoT bootloader active slot"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + let persist = true; + mgs_client + .sp_component_active_slot_set( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + persist, + &SpComponentFirmwareSlot { slot: 1 }, + ) + .await?; + Ok(()) + }) + .await?; + + debug!(log, "attempting to reset device to set to new RoT bootloader version"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await?; + Ok(()) + }) + .await?; + + Ok(()) + } + .boxed() + } +} + +/// Poll the RoT asking for its boot information. This is used to check +/// state after RoT bootloader updates +async fn wait_for_stage0_next_image_check( + log: &Logger, + mgs_clients: &mut MgsClients, + sp_type: SpType, + sp_slot: u16, + timeout: Duration, +) -> Result, PostUpdateError> { + let before = Instant::now(); + loop { + match mgs_clients + .try_all_serially(log, |mgs_client| async move { + mgs_client + .sp_rot_boot_info( + sp_type, + sp_slot, + SpComponent::ROT.const_as_str(), + &GetRotBootInfoParams { + version: RotBootInfo::HIGHEST_KNOWN_VERSION, + }, + ) + .await + }) + .await + { + Ok(state) => match state.into_inner() { + // The minimum we will ever return is v3. + // Additionally, V2 does not report image errors, so we cannot + // know with certainty if a signature check came back with errors + RotState::V2 { .. } => { + let error = "unexpected RoT version: 2".to_string(); + error!( + log, + "failed to get RoT boot info"; + "error" => &error + ); + return Err(PostUpdateError::FatalError { error }); + } + RotState::V3 { stage0next_error, .. } => { + return Ok(stage0next_error); + } + // The RoT is probably still booting + RotState::CommunicationFailed { message } => { + if before.elapsed() >= timeout { + error!( + log, + "failed to get RoT boot info"; + "error" => %message + ); + return Err(PostUpdateError::FatalError { + error: message, + }); + } + + info!( + log, + "failed getting RoT boot info (will retry)"; + "error" => %message, + ); + tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await; + } + }, + // The RoT might still be booting + Err(error) => { + let e = InlineErrorChain::new(&error); + if before.elapsed() >= timeout { + error!( + log, + "failed to get RoT boot info"; + &e, + ); + return Err(PostUpdateError::FatalError { + error: e.to_string(), + }); + } + + info!( + log, + "failed getting RoT boot info (will retry)"; + e, + ); + tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await; + } + } } } diff --git a/nexus/mgs-updates/src/rot_updater.rs b/nexus/mgs-updates/src/rot_updater.rs index 00a768f3e0d..cc73d14b844 100644 --- a/nexus/mgs-updates/src/rot_updater.rs +++ b/nexus/mgs-updates/src/rot_updater.rs @@ -11,6 +11,7 @@ use super::common_sp_update::SpComponentUpdater; use super::common_sp_update::deliver_update; use crate::SpComponentUpdateHelper; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::error_means_caboose_is_invalid; @@ -21,7 +22,6 @@ use gateway_client::types::RotState; use gateway_client::types::SpComponentFirmwareSlot; use gateway_client::types::SpType; use gateway_types::rot::RotSlot; -use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; @@ -345,27 +345,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { } } }; - match (&expected_inactive_version, &found_version) { - // expected garbage, found garbage - ( - ExpectedVersion::NoValidVersion, - FoundVersion::MissingVersion, - ) => (), - // expected a specific version and found it - ( - ExpectedVersion::Version(artifact_version), - FoundVersion::Version(found_version), - ) if artifact_version.to_string() == *found_version => (), - // anything else is a mismatch - (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) - | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) - | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { - return Err(PrecheckError::WrongInactiveVersion { - expected: expected_inactive_version.clone(), - found: found_version, - }); - } - }; + found_version.matches(expected_inactive_version)?; // If transient boot is being used, the persistent preference is not going to match // the active slot. At the moment, this mismatch can also mean one of the partitions @@ -396,33 +376,39 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - mgs_clients - .try_all_serially(log, move |mgs_client| async move { - // We want to set the slot we've just updated as the active one - debug!(log, "attempting to set active slot"); - let inactive_slot = match &update.details { - PendingMgsUpdateDetails::Rot { expected_active_slot, .. } => { - expected_active_slot.slot().toggled().to_u16() - }, - PendingMgsUpdateDetails::Sp { .. } - | PendingMgsUpdateDetails::RotBootloader { .. } => unreachable!( - "pending MGS update details within ReconfiguratorRotUpdater \ - will always be for the RoT" - ) - }; - let persist = true; - mgs_client - .sp_component_active_slot_set( - update.sp_type, - update.slot_id, - &SpComponent::ROT.to_string(), - persist, - &SpComponentFirmwareSlot { slot: inactive_slot } - ) - .await?; + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + // We want to set the slot we've just updated as the active one + debug!(log, "attempting to set active slot"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + let inactive_slot = match &update.details { + PendingMgsUpdateDetails::Rot { expected_active_slot, .. } => { + expected_active_slot.slot().toggled().to_u16() + }, + PendingMgsUpdateDetails::Sp { .. } + | PendingMgsUpdateDetails::RotBootloader { .. } => unreachable!( + "pending MGS update details within ReconfiguratorRotUpdater \ + will always be for the RoT" + ) + }; + let persist = true; + mgs_client + .sp_component_active_slot_set( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + persist, + &SpComponentFirmwareSlot { slot: inactive_slot } + ) + .await?; + Ok(()) + }) + .await?; - debug!(log, "attempting to reset device"); + debug!(log, "attempting to reset device"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { mgs_client .sp_component_reset( update.sp_type, @@ -431,7 +417,8 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { ) .await?; Ok(()) - }) - .boxed() + }).await?; + Ok(()) + }.boxed() } } diff --git a/nexus/mgs-updates/src/sp_updater.rs b/nexus/mgs-updates/src/sp_updater.rs index 7809bc2f090..158b1cf097d 100644 --- a/nexus/mgs-updates/src/sp_updater.rs +++ b/nexus/mgs-updates/src/sp_updater.rs @@ -9,6 +9,7 @@ use crate::SpComponentUpdateError; use crate::SpComponentUpdateHelper; use crate::UpdateProgress; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::SpComponentUpdater; @@ -18,7 +19,6 @@ use futures::FutureExt; use futures::future::BoxFuture; use gateway_client::SpComponent; use gateway_client::types::SpType; -use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; @@ -259,27 +259,7 @@ impl SpComponentUpdateHelper for ReconfiguratorSpUpdater { } } }; - match (&expected_inactive_version, &found_version) { - // expected garbage, found garbage - ( - ExpectedVersion::NoValidVersion, - FoundVersion::MissingVersion, - ) => (), - // expected a specific version and found it - ( - ExpectedVersion::Version(artifact_version), - FoundVersion::Version(found_version), - ) if artifact_version.to_string() == *found_version => (), - // anything else is a mismatch - (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) - | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) - | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { - return Err(PrecheckError::WrongInactiveVersion { - expected: expected_inactive_version.clone(), - found: found_version, - }); - } - }; + found_version.matches(expected_inactive_version)?; Ok(PrecheckStatus::ReadyForUpdate) } @@ -293,19 +273,23 @@ impl SpComponentUpdateHelper for ReconfiguratorSpUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - mgs_clients - .try_all_serially(log, move |mgs_client| async move { - debug!(log, "attempting to reset device"); - mgs_client - .sp_component_reset( - update.sp_type, - update.slot_id, - &SpComponent::SP_ITSELF.to_string(), - ) - .await?; - Ok(()) - }) - .boxed() + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + debug!(log, "attempting to reset device"); + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::SP_ITSELF.to_string(), + ) + .await?; + Ok(()) + }) + .await?; + Ok(()) + } + .boxed() } } diff --git a/nexus/mgs-updates/src/test_util/sp_test_state.rs b/nexus/mgs-updates/src/test_util/sp_test_state.rs index abdead68b5f..a484736a804 100644 --- a/nexus/mgs-updates/src/test_util/sp_test_state.rs +++ b/nexus/mgs-updates/src/test_util/sp_test_state.rs @@ -44,6 +44,16 @@ pub struct SpTestState { /// This can be None if the caboose contents were not valid. pub caboose_rot_b: Option, + /// caboose read from stage 0 (active slot for the RoT bootloader) + /// + /// This is not an `Option` because we never expect to fail to read this. + pub caboose_stage0: SpComponentCaboose, + + /// caboose read from stage 0 next (inactive slot for the RoT bootloader) + /// + /// This can be None if the caboose contents were not valid. + pub caboose_stage0_next: Option, + /// Overall SP state pub sp_state: SpState, @@ -94,6 +104,24 @@ impl SpTestState { ) .await .map(|c| c.into_inner()); + let caboose_stage0 = mgs_client + .sp_component_caboose_get( + sp_type, + sp_slot, + SpComponent::STAGE0.const_as_str(), + 0, + ) + .await? + .into_inner(); + let caboose_stage0_next = mgs_client + .sp_component_caboose_get( + sp_type, + sp_slot, + SpComponent::STAGE0.const_as_str(), + 1, + ) + .await + .map(|c| c.into_inner()); let sp_info = mgs_client.sp_get(sp_type, sp_slot).await?.into_inner(); let sp_boot_info = mgs_client .sp_rot_boot_info( @@ -113,6 +141,10 @@ impl SpTestState { ), caboose_rot_a: ignore_invalid_caboose_error(caboose_rot_a), caboose_rot_b: ignore_invalid_caboose_error(caboose_rot_b), + caboose_stage0, + caboose_stage0_next: ignore_invalid_caboose_error( + caboose_stage0_next, + ), sp_state: sp_info, sp_boot_info, }) @@ -141,6 +173,14 @@ impl SpTestState { } } + pub fn expect_caboose_stage0(&self) -> &SpComponentCaboose { + &self.caboose_stage0 + } + + pub fn expect_caboose_stage0_next(&self) -> &SpComponentCaboose { + self.caboose_stage0_next.as_ref().expect("stage0 next caboose") + } + pub fn expect_caboose_rot_inactive(&self) -> &SpComponentCaboose { let slot = self.expect_rot_active_slot().toggled(); match slot { @@ -255,6 +295,22 @@ impl SpTestState { }, } } + + pub fn expect_stage0_version(&self) -> ArtifactVersion { + self.expect_caboose_stage0() + .version + .parse() + .expect("valid artifact version") + } + + pub fn expect_stage0_next_version(&self) -> ExpectedVersion { + match &self.caboose_stage0_next { + Some(v) => ExpectedVersion::Version( + v.version.parse().expect("valid stage0 next version"), + ), + None => ExpectedVersion::NoValidVersion, + } + } } fn ignore_invalid_caboose_error( diff --git a/nexus/mgs-updates/src/test_util/test_artifacts.rs b/nexus/mgs-updates/src/test_util/test_artifacts.rs index 33cb4853009..4eb753f0c47 100644 --- a/nexus/mgs-updates/src/test_util/test_artifacts.rs +++ b/nexus/mgs-updates/src/test_util/test_artifacts.rs @@ -36,6 +36,8 @@ pub struct TestArtifacts { pub sp_sidecar_artifact_hash: ArtifactHash, pub rot_gimlet_artifact_hash: ArtifactHash, pub rot_sidecar_artifact_hash: ArtifactHash, + pub rot_bootloader_gimlet_artifact_hash: ArtifactHash, + pub rot_bootloader_sidecar_artifact_hash: ArtifactHash, pub artifact_cache: Arc, deployed_cabooses: BTreeMap, resolver: FixedResolver, @@ -108,12 +110,56 @@ impl TestArtifacts { ArtifactHash(digest.finalize().into()) }; + // Make an RoT bootloader update artifact for SimGimlet. + let rot_bootloader_gimlet_artifact_caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_GIMLET_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder + .write_caboose(rot_bootloader_gimlet_artifact_caboose.as_slice()) + .unwrap(); + let rot_bootloader_gimlet_artifact = builder.build_to_vec().unwrap(); + let rot_bootloader_gimlet_artifact_hash = { + let mut digest = sha2::Sha256::default(); + digest.update(&rot_bootloader_gimlet_artifact); + ArtifactHash(digest.finalize().into()) + }; + + // Make an RoT bootloader update artifact for SimSidecar + let rot_bootloader_sidecar_artifact_caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_SIDECAR_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder + .write_caboose(rot_bootloader_sidecar_artifact_caboose.as_slice()) + .unwrap(); + let rot_bootloader_sidecar_artifact = builder.build_to_vec().unwrap(); + let rot_bootloader_sidecar_artifact_hash = { + let mut digest = sha2::Sha256::default(); + digest.update(&rot_bootloader_sidecar_artifact); + ArtifactHash(digest.finalize().into()) + }; + // Assemble a map of artifact hash to artifact contents. let artifact_data = [ (sp_gimlet_artifact_hash, sp_gimlet_artifact), (sp_sidecar_artifact_hash, sp_sidecar_artifact), (rot_gimlet_artifact_hash, rot_gimlet_artifact), (rot_sidecar_artifact_hash, rot_sidecar_artifact), + ( + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_gimlet_artifact, + ), + ( + rot_bootloader_sidecar_artifact_hash, + rot_bootloader_sidecar_artifact, + ), ] .into_iter() .collect(); @@ -124,6 +170,14 @@ impl TestArtifacts { (sp_sidecar_artifact_hash, sp_sidecar_artifact_caboose), (rot_gimlet_artifact_hash, rot_gimlet_artifact_caboose), (rot_sidecar_artifact_hash, rot_sidecar_artifact_caboose), + ( + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_gimlet_artifact_caboose, + ), + ( + rot_bootloader_sidecar_artifact_hash, + rot_bootloader_sidecar_artifact_caboose, + ), ] .into_iter() .collect(); @@ -155,6 +209,8 @@ impl TestArtifacts { sp_sidecar_artifact_hash, rot_gimlet_artifact_hash, rot_sidecar_artifact_hash, + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_sidecar_artifact_hash, deployed_cabooses, artifact_cache, resolver, diff --git a/nexus/mgs-updates/src/test_util/updates.rs b/nexus/mgs-updates/src/test_util/updates.rs index 84bfb70a593..76b1dede6d5 100644 --- a/nexus/mgs-updates/src/test_util/updates.rs +++ b/nexus/mgs-updates/src/test_util/updates.rs @@ -54,9 +54,10 @@ pub enum ExpectedSpComponent { override_expected_pending_persistent_boot_preference: Option, override_expected_transient_boot_preference: Option, }, - // TODO-K: Remove once fully implemented - #[allow(dead_code)] - RotBootloader {}, + RotBootloader { + override_expected_stage0: Option, + override_expected_stage0_next: Option, + }, } /// Describes an update operation that can later be executed any number of times @@ -152,7 +153,23 @@ impl UpdateDescription<'_> { expected_transient_boot_preference, } } - ExpectedSpComponent::RotBootloader {} => unimplemented!(), + ExpectedSpComponent::RotBootloader { + override_expected_stage0, + override_expected_stage0_next, + } => { + let expected_stage0_version = override_expected_stage0 + .clone() + .unwrap_or_else(|| sp1.expect_stage0_version()); + let expected_stage0_next_version = + override_expected_stage0_next + .clone() + .unwrap_or_else(|| sp1.expect_stage0_next_version()); + + PendingMgsUpdateDetails::RotBootloader { + expected_stage0_version, + expected_stage0_next_version, + } + } }; let deployed_caboose = self @@ -492,6 +509,64 @@ impl FinishedUpdateAttempt { } } + /// Asserts various conditions associated with successful RoT bootloader updates. + pub fn expect_rot_bootloader_success( + &self, + expected_result: UpdateCompletedHow, + ) { + let how = match self.result { + Ok(how) if how == expected_result => how, + _ => { + panic!( + "unexpected result from apply_update(): {:?}", + self.result, + ); + } + }; + + eprintln!("apply_update() -> {:?}", how); + let sp2 = &self.sp2; + + // The active slot should contain what we just updated to. + let deployed_caboose = &self.deployed_caboose; + assert!(cabooses_equal(&sp2.caboose_stage0, &deployed_caboose)); + + // RoT information should not have changed. + let sp1 = &self.sp1; + assert_eq!(sp1.expect_caboose_rot_a(), sp2.expect_caboose_rot_a()); + assert_eq!(sp1.expect_caboose_rot_b(), sp2.expect_caboose_rot_b()); + + // SP information should not have changed + let sp1 = &self.sp1; + assert_eq!( + sp1.expect_caboose_sp_inactive(), + sp2.expect_caboose_sp_inactive(), + ); + assert_eq!( + sp1.expect_caboose_sp_active(), + sp2.expect_caboose_sp_active(), + ); + + if how == UpdateCompletedHow::FoundNoChangesNeeded { + assert_eq!(sp1.caboose_stage0, sp2.caboose_stage0); + assert_eq!( + sp1.expect_caboose_stage0_next(), + sp2.expect_caboose_stage0_next() + ); + assert_eq!(sp1.sp_state, sp2.sp_state); + assert_eq!(sp1.sp_boot_info, sp2.sp_boot_info); + } else { + // One way or another, an update was completed. Stage 0 + // and Stage 0 next should contain the same contents + assert_eq!( + sp2.expect_caboose_stage0(), + sp2.expect_caboose_stage0_next() + ); + // RoT boot info should have changed + assert_ne!(sp1.sp_boot_info, sp2.sp_boot_info); + } + } + /// Asserts that the update failed and invokes `assert_error(error, /// initial_sp_state, final_sp_state)` for you to make your own assertions /// about why it failed and what state things were left in.