diff --git a/Cargo.lock b/Cargo.lock index db010a4e20..4dd0b111cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1157,6 +1157,7 @@ dependencies = [ "drv-spartan7-loader-api", "drv-spi-api", "drv-stm32xx-sys-api", + "ereports", "fixedstr", "gnarle", "idol", @@ -1178,6 +1179,7 @@ name = "drv-cpu-power-state" version = "0.1.0" dependencies = [ "counters", + "microcbor", "num-traits", "userlib", "zerocopy 0.8.27", @@ -1325,6 +1327,7 @@ dependencies = [ "drv-spi-api", "drv-stm32h7-spi", "drv-stm32xx-sys-api", + "ereports", "fixedstr", "gnarle", "hubpack", @@ -2063,6 +2066,7 @@ dependencies = [ "drv-packrat-vpd-loader", "drv-psc-seq-api", "drv-stm32xx-sys-api", + "ereports", "fixedstr", "idol", "microcbor", @@ -2991,6 +2995,16 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "ereports" +version = "0.1.0" +dependencies = [ + "drv-cpu-power-state", + "drv-i2c-devices", + "fixedstr", + "microcbor", +] + [[package]] name = "errno" version = "0.2.8" diff --git a/drv/cosmo-seq-server/Cargo.toml b/drv/cosmo-seq-server/Cargo.toml index 38694a49ff..4405079d3f 100644 --- a/drv/cosmo-seq-server/Cargo.toml +++ b/drv/cosmo-seq-server/Cargo.toml @@ -16,6 +16,7 @@ drv-packrat-vpd-loader = { path = "../packrat-vpd-loader" } drv-spartan7-loader-api = { path = "../spartan7-loader-api" } drv-spi-api = { path = "../spi-api" } drv-stm32xx-sys-api = { path = "../stm32xx-sys-api" } +ereports = { path = "../../lib/ereports" } gnarle = { path = "../../lib/gnarle" } ringbuf = { path = "../../lib/ringbuf" } userlib = { path = "../../sys/userlib", features = ["panic-messages"] } diff --git a/drv/cosmo-seq-server/src/main.rs b/drv/cosmo-seq-server/src/main.rs index 4795dbd0d3..0496810bb2 100644 --- a/drv/cosmo-seq-server/src/main.rs +++ b/drv/cosmo-seq-server/src/main.rs @@ -24,6 +24,7 @@ use userlib::{ RecvMessage, }; +use crate::i2c_config::MAX_COMPONENT_ID_LEN as 
REFDES_LEN; use drv_hf_api::HostFlash; use ringbuf::{counted_ringbuf, ringbuf_entry, Count}; @@ -170,18 +171,12 @@ const SP5R4_PULL: sys_api::Pull = sys_api::Pull::None; use gpio_irq_pins::SEQ_IRQ; -//////////////////////////////////////////////////////////////////////////////// - /// Helper type which includes both sequencer and NIC state machine states struct StateMachineStates { seq: Result, nic: Result, } -const EREPORT_BUF_LEN: usize = microcbor::max_cbor_len_for!( - task_packrat_api::Ereport, -); - #[export_name = "main"] fn main() -> ! { // Populate packrat with our mac address and identity. @@ -195,6 +190,11 @@ fn main() -> ! { EREPORT_BUF.claim() }; + let mut ereporter = Ereporter { + packrat, + buf: ereport_buf, + }; + // // Apply the configuration mitigation on the BMR491, if required. This // is an external device access and may fail. We'll attempt it thrice @@ -219,21 +219,19 @@ fn main() -> ! { if let Some(last_cause) = last_cause { // Report the failure even if we eventually succeeded. - try_send_ereport( - &packrat, - &mut ereport_buf[..], - EreportClass::Bmr491MitigationFailure, - EreportKind::Bmr491MitigationFailure { - refdes: FixedStr::from_str(dev.component_id()), - failures, - last_cause, - succeeded, - }, - ); + let ereport = ereports::pwr::Bmr491MitigationFailure { + refdes: FixedStr::<{ REFDES_LEN }>::from_str( + dev.component_id(), + ), + failures, + last_cause, + succeeded, + }; + ereporter.try_send_ereport(&ereport); } } - match init(packrat, ereport_buf) { + match init(ereporter) { // Set up everything nicely, time to start serving incoming messages. Ok(mut server) => { // Enable the backplane PCIe clock if requested @@ -275,10 +273,7 @@ fn main() -> ! 
{ } } -fn init( - packrat: Packrat, - ereport_buf: &'static mut [u8; EREPORT_BUF_LEN], -) -> Result { +fn init(ereporter: Ereporter) -> Result { let sys = sys_api::Sys::from(SYS.get_task_id()); // Pull the fault line low while we're loading @@ -369,7 +364,7 @@ fn init( // Turn on the chassis LED! sys.gpio_set(SP_CHASSIS_STATUS_LED); - Ok(ServerImpl::new(loader, packrat, ereport_buf)) + Ok(ServerImpl::new(loader, ereporter)) } /// Configures the front FPGA pins and holds it in reset @@ -443,6 +438,8 @@ fn init_front_fpga( #[allow(unused)] struct ServerImpl { state: PowerState, + /// The Hubris tick at which we transitioned to the current state. + since: u64, jefe: Jefe, sys: Sys, hf: HostFlash, @@ -450,52 +447,13 @@ struct ServerImpl { espi: fmc_periph::espi::Espi, debug: fmc_periph::debug_ctrl::DebugCtrl, vcore: VCore, - /// Static buffer for encoding ereports. This is a static so that we don't - /// have it on the stack when encoding ereports. - ereport_buf: &'static mut [u8; EREPORT_BUF_LEN], -} - -#[derive(microcbor::Encode)] -pub enum EreportClass { - #[cbor(rename = "hw.pwr.pmbus.alert")] - PmbusAlert, - #[cbor(rename = "hw.pwr.bmr491.mitfail")] - Bmr491MitigationFailure, -} - -#[derive(microcbor::EncodeFields)] -pub(crate) enum EreportKind { - Bmr491MitigationFailure { - refdes: FixedStr<'static, { crate::i2c_config::MAX_COMPONENT_ID_LEN }>, - failures: u32, - last_cause: drv_i2c_devices::bmr491::MitigationFailureKind, - succeeded: bool, - }, - PmbusAlert { - refdes: FixedStr<'static, { crate::i2c_config::MAX_COMPONENT_ID_LEN }>, - rail: vcore::Rail, - time: u64, - pwr_good: Option, - pmbus_status: PmbusStatus, - }, -} - -#[derive(Copy, Clone, Default, microcbor::Encode)] -pub(crate) struct PmbusStatus { - word: Option, - input: Option, - iout: Option, - vout: Option, - temp: Option, - cml: Option, - mfr: Option, + ereporter: Ereporter, } impl ServerImpl { fn new( loader: drv_spartan7_loader_api::Spartan7Loader, - packrat: Packrat, - ereport_buf: &'static mut 
[u8; EREPORT_BUF_LEN], + ereporter: Ereporter, ) -> Self { let now = sys_get_timer().now; @@ -517,14 +475,15 @@ impl ServerImpl { ServerImpl { state: PowerState::A2, + since: now, jefe, sys: Sys::from(SYS.get_task_id()), hf: HostFlash::from(HF.get_task_id()), seq, espi, debug, - vcore: VCore::new(I2C.get_task_id(), packrat), - ereport_buf, + vcore: VCore::new(I2C.get_task_id()), + ereporter, } } @@ -609,6 +568,11 @@ impl ServerImpl { if !present { ringbuf_entry!(Trace::CPUNotPresent); + self.ereporter.try_send_ereport( + &ereports::cpu::CpuMissing { + cpu: &HOST_CPU_REFDES, + }, + ); err = CpuSeqError::CPUNotPresent; break; } @@ -648,26 +612,35 @@ impl ServerImpl { }); // From sp5-mobo-guide-56870_1.1.pdf table 72 - match (coretype0, coretype1, coretype2) { + let coretype_ok = match (coretype0, coretype1, coretype2) { // These correspond to Type-2 and Type-3 - (true, false, true) | (true, false, false) => (), + (true, false, true) | (true, false, false) => true, // Reject all other combos and return to A0 - _ => { - self.seq.power_ctrl.modify(|m| m.set_a0_en(false)); - return Err(CpuSeqError::UnrecognizedCPU); - } + _ => false, }; // From sp5-mobo-guide-56870_1.1.pdf table 73 - match (sp5r1, sp5r2, sp5r3, sp5r4) { + let sp5rx_ok = // There is only combo we accept here - (true, false, false, false) => (), - // Reject all other combos and return to A0 - _ => { - self.seq.power_ctrl.modify(|m| m.set_a0_en(false)); - return Err(CpuSeqError::UnrecognizedCPU); - } - }; + (sp5r1, sp5r2, sp5r3, sp5r4) == (true, false, false, false); + + if !(coretype_ok && sp5rx_ok) { + self.seq.power_ctrl.modify(|m| m.set_a0_en(false)); + let ereport = ereports::cpu::UnsupportedCpu { + cpu: &HOST_CPU_REFDES, + coretype: ereports::cpu::CpuTypeBits { + bits: [coretype0, coretype1, coretype2], + ok: coretype_ok, + }, + rev: ereports::cpu::CpuTypeBits { + bits: [sp5r1, sp5r2, sp5r3, sp5r4], + ok: sp5rx_ok, + }, + }; + self.ereporter.try_send_ereport(&ereport); + return 
Err(CpuSeqError::UnrecognizedCPU); + } + // Turn on the voltage regulator undervolt alerts. self.enable_sequencer_interrupts(); @@ -727,13 +700,14 @@ impl ServerImpl { _ => return Err(CpuSeqError::IllegalTransition), } - self.set_state_internal(state); + self.set_state_internal(state, now); Ok(Transition::Changed) } /// Updates our internal `state` and the global state in `jefe` - fn set_state_internal(&mut self, state: PowerState) { + fn set_state_internal(&mut self, state: PowerState, now: u64) { self.state = state; + self.since = now; self.jefe.set_state(state as u32); self.poke_timer(); } @@ -891,7 +865,7 @@ impl ServerImpl { pwr_cont2: ifr.pwr_cont2_to_fpga1_alert, }; self.vcore - .handle_pmbus_alert(which_vrms, now, self.ereport_buf); + .handle_pmbus_alert(which_vrms, now, &mut self.ereporter); // We need not instruct the sequencer to reset. PMBus alerts from // the RAA229620As are divided into two categories, "warnings" and @@ -962,7 +936,10 @@ impl ServerImpl { self.seq.ifr.modify(|h| h.set_thermtrip(true)); ringbuf_entry!(Trace::Thermtrip); action = InternalAction::ThermTrip; - // Great place for an ereport? + self.ereporter.try_send_ereport(&ereports::cpu::Thermtrip { + cpu: &HOST_CPU_REFDES, + state: self.ereport_current_state(), + }); } if ifr.a0mapo { @@ -977,7 +954,10 @@ impl ServerImpl { self.seq.ifr.modify(|h| h.set_smerr_assert(true)); ringbuf_entry!(Trace::SmerrInterrupt); action = InternalAction::Smerr; - // Great place for an ereport? 
+ self.ereporter.try_send_ereport(&ereports::cpu::Smerr { + cpu: &HOST_CPU_REFDES, + state: self.ereport_current_state(), + }); } // Fan Fault is unconnected @@ -992,7 +972,7 @@ impl ServerImpl { why: StateChangeReason::CpuReset, now, }); - self.set_state_internal(PowerState::A0Reset); + self.set_state_internal(PowerState::A0Reset, now); } InternalAction::NicMapo => { // Presumably we are in A0+HP, so send us back to A0 so that the @@ -1004,7 +984,7 @@ impl ServerImpl { why: StateChangeReason::NicMapo, now, }); - self.set_state_internal(PowerState::A0); + self.set_state_internal(PowerState::A0, now); } InternalAction::ThermTrip => { // This is a terminal state; we set our state to `A0Thermtrip` @@ -1015,7 +995,7 @@ impl ServerImpl { why: StateChangeReason::Overheat, now, }); - self.set_state_internal(PowerState::A0Thermtrip); + self.set_state_internal(PowerState::A0Thermtrip, now); } InternalAction::Mapo => { // This is a terminal state (for now) @@ -1038,6 +1018,13 @@ impl ServerImpl { fn is_seq_irq_asserted(&self) -> bool { self.sys.gpio_read(SEQ_IRQ) == 0 } + + fn ereport_current_state(&self) -> ereports::pwr::CurrentState { + ereports::pwr::CurrentState { + cur: self.state, + since_ms: self.since, + } + } } impl idl::InOrderSequencerImpl for ServerImpl { @@ -1247,27 +1234,47 @@ impl NotificationHandler for ServerImpl { } } -fn try_send_ereport( - packrat: &task_packrat_api::Packrat, - ereport_buf: &mut [u8], - class: EreportClass, - report: EreportKind, -) { - let eresult = packrat.deliver_microcbor_ereport( - &task_packrat_api::Ereport { - class, - version: 0, - report, - }, - ereport_buf, - ); - match eresult { - Ok(len) => ringbuf_entry!(Trace::EreportSent(len)), - Err(task_packrat_api::EreportEncodeError::Packrat { len, err }) => { - ringbuf_entry!(Trace::EreportLost(len, err)) - } - Err(task_packrat_api::EreportEncodeError::Encoder(_)) => { - ringbuf_entry!(Trace::EreportTooBig) 
+//////////////////////////////////////////////////////////////////////////////// + +const EREPORT_BUF_LEN: usize = microcbor::max_cbor_len_for![ + ereports::pwr::PmbusAlert, + ereports::pwr::Bmr491MitigationFailure<{ REFDES_LEN }>, + ereports::cpu::Thermtrip, + ereports::cpu::Smerr, + ereports::cpu::UnsupportedCpu<3, 4>, + ereports::cpu::CpuMissing, +]; + +static HOST_CPU_REFDES: ereports::cpu::HostCpuRefdes = + ereports::cpu::HostCpuRefdes { + refdes: fixedstr::FixedString::from_str("P0"), + dev_id: fixedstr::FixedString::from_str("sp5-host-cpu"), + }; + +/// This is just the Packrat API handle and the ereport buffer bundled together +/// in one thing so that it can be passed into various places as a single +/// argument. +pub(crate) struct Ereporter { + packrat: task_packrat_api::Packrat, + buf: &'static mut [u8; EREPORT_BUF_LEN], +} + +impl Ereporter { + pub(crate) fn try_send_ereport( + &mut self, + ereport: &impl microcbor::StaticCborLen, + ) { + let eresult = self + .packrat + .deliver_microcbor_ereport(&ereport, &mut self.buf[..]); + match eresult { + Ok(len) => ringbuf_entry!(Trace::EreportSent(len)), + Err(task_packrat_api::EreportEncodeError::Packrat { len, err }) => { + ringbuf_entry!(Trace::EreportLost(len, err)) + } + Err(task_packrat_api::EreportEncodeError::Encoder(_)) => { + ringbuf_entry!(Trace::EreportTooBig) + } } } } diff --git a/drv/cosmo-seq-server/src/vcore.rs b/drv/cosmo-seq-server/src/vcore.rs index e514bed71b..4f4b18f6c3 100644 --- a/drv/cosmo-seq-server/src/vcore.rs +++ b/drv/cosmo-seq-server/src/vcore.rs @@ -13,8 +13,10 @@ //! use super::i2c_config; +use crate::Ereporter; use drv_i2c_api::ResponseCode; use drv_i2c_devices::raa229620a::{self, Raa229620A}; +use ereports::pwr::{PmbusAlert, PmbusStatus}; use fixedstr::FixedStr; use pmbus::commands::raa229620a::STATUS_WORD; use ringbuf::*; @@ -26,7 +28,6 @@ pub(super) struct VCore { /// `PWR_CONT2`: This regulator controls `VDDCR_CPU1` and `VDDIO_SP5` rails. 
vddcr_cpu1: Raa229620A, faulted: Vrms, - packrat: task_packrat_api::Packrat, } #[derive(Copy, Clone, PartialEq, microcbor::Encode)] @@ -176,7 +177,7 @@ const VCORE_UV_WARN_LIMIT: units::Volts = units::Volts(11.0); const VCORE_NSAMPLES: usize = 25; impl VCore { - pub fn new(i2c: TaskId, packrat: task_packrat_api::Packrat) -> Self { + pub fn new(i2c: TaskId) -> Self { let (device, rail) = i2c_config::pmbus::vddcr_cpu0_a0(i2c); let vddcr_cpu0 = Raa229620A::new(&device, rail); @@ -190,7 +191,6 @@ impl VCore { pwr_cont1: false, pwr_cont2: false, }, - packrat, } } @@ -268,7 +268,7 @@ impl VCore { &mut self, vrms: Vrms, now: u64, - ereport_buf: &mut [u8], + ereporter: &mut Ereporter, ) { ringbuf_entry!(Trace::PmbusAlert { timestamp: now, @@ -281,7 +281,7 @@ impl VCore { now, Rail::VddcrCpu0, vrms.pwr_cont1, - ereport_buf, + ereporter, ); input_fault |= state.input_fault; self.faulted.pwr_cont1 |= state.faulted; @@ -292,7 +292,7 @@ impl VCore { now, Rail::VddcrCpu1, vrms.pwr_cont1, - ereport_buf, + ereporter, ); input_fault |= state.input_fault; self.faulted.pwr_cont2 |= state.faulted; @@ -350,7 +350,7 @@ impl VCore { now: u64, rail: Rail, alerted: bool, - ereport_buf: &mut [u8], + ereporter: &mut Ereporter, ) -> RegulatorState { use pmbus::commands::raa229620a::STATUS_WORD; @@ -458,7 +458,7 @@ impl VCore { .map(|s| s.0); ringbuf_entry!(Trace::StatusMfrSpecific(rail, status_mfr)); - let pmbus_status = crate::PmbusStatus { + let pmbus_status = PmbusStatus { word: status_word.map(|s| s.0).ok(), input: status_input.ok(), vout: status_vout.ok(), @@ -468,19 +468,16 @@ impl VCore { mfr: status_mfr.ok(), }; - let ereport = crate::EreportKind::PmbusAlert { + let ereport = PmbusAlert { rail, - refdes: FixedStr::from_str(device.i2c_device().component_id()), + refdes: FixedStr::<{ crate::REFDES_LEN }>::from_str( + device.i2c_device().component_id(), + ), time: now, pmbus_status, pwr_good: power_good, }; - crate::try_send_ereport( - &self.packrat, - ereport_buf, - 
crate::EreportClass::PmbusAlert, - ereport, - ); + ereporter.try_send_ereport(&ereport); // TODO(eliza): if POWER_GOOD has been deasserted, we should produce a // subsequent ereport for that. diff --git a/drv/cpu-power-state/Cargo.toml b/drv/cpu-power-state/Cargo.toml index 58fcd88618..badb1f7938 100644 --- a/drv/cpu-power-state/Cargo.toml +++ b/drv/cpu-power-state/Cargo.toml @@ -9,6 +9,7 @@ zerocopy = { workspace = true } zerocopy-derive = { workspace = true } num-traits = { workspace = true } counters = { path = "../../lib/counters", features = ["derive"] } +microcbor = { path = "../../lib/microcbor", optional = true } [lib] test = false diff --git a/drv/cpu-power-state/src/lib.rs b/drv/cpu-power-state/src/lib.rs index 657cdf9180..87bf9e34f9 100644 --- a/drv/cpu-power-state/src/lib.rs +++ b/drv/cpu-power-state/src/lib.rs @@ -22,6 +22,7 @@ use zerocopy::{Immutable, IntoBytes, KnownLayout}; KnownLayout, counters::Count, )] +#[cfg_attr(feature = "microcbor", derive(microcbor::Encode))] #[repr(u8)] pub enum PowerState { /// Initial A2 state where the SP and most associated circuitry is powered. 
diff --git a/drv/gimlet-seq-server/Cargo.toml b/drv/gimlet-seq-server/Cargo.toml index ed3cb41f9d..fdf8bcf295 100644 --- a/drv/gimlet-seq-server/Cargo.toml +++ b/drv/gimlet-seq-server/Cargo.toml @@ -15,6 +15,7 @@ drv-spi-api = { path = "../spi-api" } drv-stm32h7-spi = { path = "../stm32h7-spi" } drv-stm32xx-sys-api = { path = "../stm32xx-sys-api" } counters = { path = "../../lib/counters" } +ereports = { path = "../../lib/ereports" } gnarle = { path = "../../lib/gnarle" } ringbuf = { path = "../../lib/ringbuf" } task-jefe-api = { path = "../../task/jefe-api" } diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index ba96a6b0a4..fac443ec87 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -19,6 +19,7 @@ use userlib::{ }; use zerocopy::IntoBytes; +use crate::i2c_config::MAX_COMPONENT_ID_LEN as REFDES_LEN; use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason, Transition}; use drv_hf_api as hf_api; use drv_i2c_api as i2c; @@ -202,58 +203,18 @@ fn main() -> ! { struct ServerImpl { state: PowerState, + /// The Hubris tick at which we transitioned to the current state. + since: u64, sys: sys_api::Sys, seq: seq_spi::SequencerFpga, jefe: Jefe, hf: hf_api::HostFlash, vcore: vcore::VCore, deadline: u64, - // Buffer for encoding ereports. This is a static so that it's not on the - // stack when handling interrupts. 
- ereport_buf: &'static mut [u8; EREPORT_BUF_LEN], + ereporter: Ereporter, } const TIMER_INTERVAL: u32 = 10; -const EREPORT_BUF_LEN: usize = microcbor::max_cbor_len_for!( - task_packrat_api::Ereport -); - -#[derive(microcbor::Encode)] -pub enum EreportClass { - #[cbor(rename = "hw.pwr.pmbus.alert")] - PmbusAlert, - #[cbor(rename = "hw.pwr.bmr491.mitfail")] - Bmr491MitigationFailure, -} - -#[derive(microcbor::EncodeFields)] -pub(crate) enum EreportKind { - PmbusAlert { - refdes: FixedStr<'static, { crate::i2c_config::MAX_COMPONENT_ID_LEN }>, - // 9 is the maximum length rail name used in this module (`VDD_VCORE`) - rail: FixedStr<'static, 9>, - time: u64, - pwr_good: Option, - pmbus_status: PmbusStatus, - }, - Bmr491MitigationFailure { - refdes: FixedStr<'static, { crate::i2c_config::MAX_COMPONENT_ID_LEN }>, - failures: u32, - last_cause: drv_i2c_devices::bmr491::MitigationFailureKind, - succeeded: bool, - }, -} - -#[derive(Copy, Clone, Default, microcbor::Encode)] -pub(crate) struct PmbusStatus { - word: Option, - input: Option, - iout: Option, - vout: Option, - temp: Option, - cml: Option, - mfr: Option, -} impl ServerImpl { fn init( @@ -490,13 +451,16 @@ impl ServerImpl { hl::sleep_for(1); } - let ereport_buf = { + let packrat = Packrat::from(PACKRAT.get_task_id()); + let mut ereporter = { use static_cell::ClaimOnceCell; static EREPORT_BUF: ClaimOnceCell<[u8; EREPORT_BUF_LEN]> = ClaimOnceCell::new([0; EREPORT_BUF_LEN]); - EREPORT_BUF.claim() + Ereporter { + buf: EREPORT_BUF.claim(), + packrat: packrat.clone(), + } }; - let packrat = Packrat::from(PACKRAT.get_task_id()); // // Apply the configuration mitigation on the BMR491, if required. This @@ -523,17 +487,15 @@ impl ServerImpl { if let Some(last_cause) = last_cause { // Report the failure even if we eventually succeeded. 
- try_send_ereport( - &packrat, - &mut ereport_buf[..], - EreportClass::Bmr491MitigationFailure, - EreportKind::Bmr491MitigationFailure { - refdes: FixedStr::from_str(dev.component_id()), - failures, - last_cause, - succeeded, - }, - ); + let ereport = ereports::pwr::Bmr491MitigationFailure { + refdes: FixedStr::<{ REFDES_LEN }>::from_str( + dev.component_id(), + ), + failures, + last_cause, + succeeded, + }; + ereporter.try_send_ereport(&ereport); } } @@ -581,13 +543,14 @@ impl ServerImpl { let mut server = Self { state: PowerState::A2, + since: 0, // we have been in A2 since we booted :) sys: sys.clone(), seq, jefe, hf, deadline: 0, - vcore: vcore::VCore::new(sys, packrat, &device, rail), - ereport_buf, + vcore: vcore::VCore::new(sys, &device, rail), + ereporter, }; // Power on, unless suppressed by the `stay-in-a2` feature @@ -627,7 +590,7 @@ impl NotificationHandler for ServerImpl { fn handle_notification(&mut self, bits: userlib::NotificationBits) { if bits.check_notification_mask(self.vcore.mask()) { - self.vcore.handle_notification(self.ereport_buf); + self.vcore.handle_notification(&mut self.ereporter); } if !bits.has_timer_fired(notifications::TIMER_MASK) { @@ -643,6 +606,7 @@ impl NotificationHandler for ServerImpl { }); if self.state == PowerState::A0 || self.state == PowerState::A0PlusHP { + let now = sys_get_timer().now; // // The first order of business is to check if sequencer saw a // falling edge on PWROK (denoting a reset) or a THERMTRIP. If it @@ -650,8 +614,8 @@ impl NotificationHandler for ServerImpl { // if both are indicated, we will clear both conditions -- but // land in A0Thermtrip). 
// - self.check_reset(ifr); - self.check_thermtrip(ifr); + self.check_reset(ifr, now); + self.check_thermtrip(ifr, now); // // Now we need to check NIC_PWREN_L to assure that our power state @@ -669,7 +633,7 @@ impl NotificationHandler for ServerImpl { self.seq .clear_bytes(Addr::NIC_CTRL, &[cld_rst]) .unwrap_lite(); - self.update_state_internal(PowerState::A0PlusHP); + self.update_state_internal(PowerState::A0PlusHP, now); } (PowerState::A0PlusHP, true) => { @@ -701,7 +665,10 @@ impl NotificationHandler for ServerImpl { self.seq .set_bytes(Addr::NIC_CTRL, &[cld_rst]) .unwrap_lite(); - self.update_state_internal(PowerState::A0); + self.update_state_internal( + PowerState::A0, + sys_get_timer().now, + ); } (PowerState::A0, true) | (PowerState::A0PlusHP, false) => { @@ -772,8 +739,9 @@ where } impl ServerImpl { - fn update_state_internal(&mut self, state: PowerState) { + fn update_state_internal(&mut self, state: PowerState, now: u64) { ringbuf_entry!(Trace::UpdateState(state)); + self.since = now; self.state = state; self.jefe.set_state(state as u32); } @@ -891,6 +859,11 @@ impl ServerImpl { ringbuf_entry!(Trace::CPUPresent(present)); if !present { + self.ereporter.try_send_ereport( + &ereports::cpu::CpuMissing { + cpu: &HOST_CPU_REFDES, + }, + ); return Err(self.a0_failure(SeqError::CPUNotPresent)); } @@ -910,7 +883,21 @@ impl ServerImpl { // be high (not connected on Type-0/Type-1/Type-2), and SP3R2 // to be low (VSS on Type-0/Type-1/Type-2). 
// - if !coretype || !sp3r1 || sp3r2 { + let rev_ok = sp3r1 && !sp3r2; + if !coretype || !rev_ok { + self.ereporter.try_send_ereport( + &ereports::cpu::UnsupportedCpu { + cpu: &HOST_CPU_REFDES, + coretype: ereports::cpu::CpuTypeBits { + bits: [coretype], + ok: coretype, + }, + rev: ereports::cpu::CpuTypeBits { + bits: [sp3r1, sp3r2], + ok: rev_ok, + }, + }, + ); return Err(self.a0_failure(SeqError::UnrecognizedCPU)); } @@ -990,11 +977,10 @@ impl ServerImpl { // Using wrapping_sub here because the timer is monotonic, so // we, the programmers, know that now > start. rustc, the // compiler, is not aware of this. - ringbuf_entry!(Trace::A0( - (sys_get_timer().now.wrapping_sub(start)) as u16 - )); + let now = sys_get_timer().now; + ringbuf_entry!(Trace::A0((now.wrapping_sub(start)) as u16)); - self.update_state_internal(PowerState::A0); + self.update_state_internal(PowerState::A0, now); Ok(Transition::Changed) } @@ -1040,7 +1026,7 @@ impl ServerImpl { return Err(SeqError::MuxToSPFailed); } - self.update_state_internal(PowerState::A2); + self.update_state_internal(PowerState::A2, sys_get_timer().now); ringbuf_entry_v3p3_sys_a0_vout(); ringbuf_entry!(Trace::A2); @@ -1117,12 +1103,16 @@ impl ServerImpl { // seen it (and knowing that the FPGA has already taken care of the // time-critical bits to assure that we don't melt!). // - fn check_thermtrip(&mut self, ifr: u8) { + fn check_thermtrip(&mut self, ifr: u8, now: u64) { let thermtrip = Reg::IFR::THERMTRIP; if ifr & thermtrip != 0 { self.seq.clear_bytes(Addr::IFR, &[thermtrip]).unwrap_lite(); - self.update_state_internal(PowerState::A0Thermtrip); + self.ereporter.try_send_ereport(&ereports::cpu::Thermtrip { + cpu: &HOST_CPU_REFDES, + state: self.ereport_current_state(), + }); + self.update_state_internal(PowerState::A0Thermtrip, now); } } @@ -1134,7 +1124,7 @@ impl ServerImpl { // of RESET_L. If we have seen a host reset, we send ourselves to // A0Reset. 
// - fn check_reset(&mut self, ifr: u8) { + fn check_reset(&mut self, ifr: u8, now: u64) { let pwrok_fedge = Reg::IFR::AMD_PWROK_FEDGE; if ifr & pwrok_fedge != 0 { @@ -1158,7 +1148,7 @@ impl ServerImpl { let mask = pwrok_fedge | Reg::IFR::AMD_RSTN_FEDGE; self.seq.clear_bytes(Addr::IFR, &[mask]).unwrap_lite(); - self.update_state_internal(PowerState::A0Reset); + self.update_state_internal(PowerState::A0Reset, now); } } @@ -1179,6 +1169,13 @@ impl ServerImpl { _ => None, } } + + fn ereport_current_state(&self) -> ereports::pwr::CurrentState { + ereports::pwr::CurrentState { + cur: self.state, + since_ms: self.since, + } + } } impl idl::InOrderSequencerImpl for ServerImpl { @@ -1628,31 +1625,52 @@ cfg_if::cfg_if! { } } -fn try_send_ereport( - packrat: &packrat_api::Packrat, - ereport_buf: &mut [u8], - class: EreportClass, - report: EreportKind, -) { - let eresult = packrat.deliver_microcbor_ereport( - &packrat_api::Ereport { - class, - version: 0, - report, - }, - ereport_buf, - ); - match eresult { - Ok(len) => ringbuf_entry!(Trace::EreportSent(len)), - Err(task_packrat_api::EreportEncodeError::Packrat { len, err }) => { - ringbuf_entry!(Trace::EreportLost(len, err)) - } - Err(task_packrat_api::EreportEncodeError::Encoder(_)) => { - ringbuf_entry!(Trace::EreportTooBig) +//////////////////////////////////////////////////////////////////////////////// + +const EREPORT_BUF_LEN: usize = microcbor::max_cbor_len_for![ + ereports::pwr::PmbusAlert, { REFDES_LEN }>, + ereports::pwr::Bmr491MitigationFailure<{ REFDES_LEN }>, + ereports::cpu::Thermtrip, + ereports::cpu::UnsupportedCpu<1, 2>, + ereports::cpu::CpuMissing, +]; + +static HOST_CPU_REFDES: ereports::cpu::HostCpuRefdes = + ereports::cpu::HostCpuRefdes { + refdes: fixedstr::FixedString::from_str("P0"), + dev_id: fixedstr::FixedString::from_str("sp3-host-cpu"), + }; + +/// This is just the Packrat API handle and the ereport buffer bundled together +/// in one thing so that it can be passed into various places as a single 
+/// argument. +pub(crate) struct Ereporter { + packrat: task_packrat_api::Packrat, + buf: &'static mut [u8; EREPORT_BUF_LEN], +} + +impl Ereporter { + pub(crate) fn try_send_ereport( + &mut self, + ereport: &impl microcbor::StaticCborLen, + ) { + let eresult = self + .packrat + .deliver_microcbor_ereport(&ereport, &mut self.buf[..]); + match eresult { + Ok(len) => ringbuf_entry!(Trace::EreportSent(len)), + Err(task_packrat_api::EreportEncodeError::Packrat { len, err }) => { + ringbuf_entry!(Trace::EreportLost(len, err)) + } + Err(task_packrat_api::EreportEncodeError::Encoder(_)) => { + ringbuf_entry!(Trace::EreportTooBig) + } } } } +//////////////////////////////////////////////////////////////////////////////// + mod idl { use super::StateChangeReason; diff --git a/drv/gimlet-seq-server/src/vcore.rs b/drv/gimlet-seq-server/src/vcore.rs index 6f8a61866f..0f68ec90db 100644 --- a/drv/gimlet-seq-server/src/vcore.rs +++ b/drv/gimlet-seq-server/src/vcore.rs @@ -28,20 +28,20 @@ use super::{retry_i2c_txn, I2cTxn}; /// A2 to A0 transition to clear faults. /// use crate::gpio_irq_pins::VCORE_TO_SP_ALERT_L; +use crate::Ereporter; use drv_i2c_api::{I2cDevice, ResponseCode}; use drv_i2c_devices::raa229618::Raa229618; use drv_stm32xx_sys_api as sys_api; +use ereports::pwr::{PmbusAlert, PmbusStatus}; use fixedstr::FixedStr; use ringbuf::*; use sys_api::IrqControl; -use task_packrat_api as packrat_api; use userlib::{sys_get_timer, units}; pub struct VCore { device: Raa229618, faulted: bool, sys: sys_api::Sys, - packrat: packrat_api::Packrat, } #[derive(Copy, Clone, PartialEq)] @@ -135,17 +135,11 @@ cfg_if::cfg_if! 
{ } impl VCore { - pub fn new( - sys: &sys_api::Sys, - packrat: packrat_api::Packrat, - device: &I2cDevice, - rail: u8, - ) -> Self { + pub fn new(sys: &sys_api::Sys, device: &I2cDevice, rail: u8) -> Self { Self { device: Raa229618::new(device, rail), faulted: false, sys: sys.clone(), - packrat, } } @@ -180,10 +174,7 @@ impl VCore { Ok(()) } - pub fn handle_notification( - &mut self, - ereport_buf: &mut [u8; crate::EREPORT_BUF_LEN], - ) { + pub fn handle_notification(&mut self, ereporter: &mut Ereporter) { let now = sys_get_timer().now; let asserted = self.is_pmalert_asserted(); @@ -196,7 +187,7 @@ impl VCore { // Don't produce another ereport if PMALERT_L was already asserted // without being deasserted. if !self.faulted { - self.read_pmbus_status(now, ereport_buf); + self.read_pmbus_status(now, ereporter); } // Clear the fault now so that PMALERT_L is reasserted if a // subsequent fault occurs. Note that if the fault *condition* @@ -232,11 +223,7 @@ impl VCore { self.sys.gpio_read(VCORE_TO_SP_ALERT_L) == 0 } - fn read_pmbus_status( - &self, - now: u64, - ereport_buf: &mut [u8; crate::EREPORT_BUF_LEN], - ) { + fn read_pmbus_status(&self, now: u64, ereporter: &mut Ereporter) { use pmbus::commands::raa229618::STATUS_WORD; // Read PMBus status registers and prepare an ereport. 
@@ -331,7 +318,7 @@ impl VCore { .map(|s| s.0); ringbuf_entry!(Trace::StatusMfrSpecific(status_mfr_specific)); - let status = super::PmbusStatus { + let status = PmbusStatus { word: status_word.map(|s| s.0).ok(), input: status_input.ok(), vout: status_vout.ok(), @@ -342,20 +329,16 @@ impl VCore { }; static RAIL: FixedStr<'static, 9> = FixedStr::from_str("VDD_VCORE"); - crate::try_send_ereport( - &self.packrat, - &mut ereport_buf[..], - crate::EreportClass::PmbusAlert, - crate::EreportKind::PmbusAlert { - refdes: FixedStr::from_str( - self.device.i2c_device().component_id(), - ), - rail: RAIL, - time: now, - pwr_good, - pmbus_status: status, - }, - ); + let ereport = PmbusAlert { + refdes: FixedStr::<{ crate::REFDES_LEN }>::from_str( + self.device.i2c_device().component_id(), + ), + rail: RAIL, + time: now, + pwr_good, + pmbus_status: status, + }; + ereporter.try_send_ereport(&ereport); // TODO(eliza): if POWER_GOOD has been deasserted, we should produce a // subsequent ereport for that. 
diff --git a/drv/psc-seq-server/Cargo.toml b/drv/psc-seq-server/Cargo.toml index e615a77c30..389aca2707 100644 --- a/drv/psc-seq-server/Cargo.toml +++ b/drv/psc-seq-server/Cargo.toml @@ -14,6 +14,7 @@ task-packrat-api = { path = "../../task/packrat-api", features = ["microcbor"] } userlib = { path = "../../sys/userlib", features = ["panic-messages"] } ringbuf = { path = "../../lib/ringbuf", features = ["counters"] } counters = { path = "../../lib/counters" } +ereports = { path = "../../lib/ereports" } static-cell = { path = "../../lib/static-cell" } microcbor.path = "../../lib/microcbor" fixedstr = { path = "../../lib/fixedstr", features = ["microcbor"] } diff --git a/drv/psc-seq-server/src/main.rs b/drv/psc-seq-server/src/main.rs index f0778433cf..56f6e8e99f 100644 --- a/drv/psc-seq-server/src/main.rs +++ b/drv/psc-seq-server/src/main.rs @@ -1191,6 +1191,7 @@ include!(concat!(env!("OUT_DIR"), "/i2c_config.rs")); mod ereport { use super::*; + pub(crate) use ereports::pwr::PmbusStatus; #[derive(Copy, Clone, Eq, PartialEq, microcbor::Encode)] pub(super) enum Class { @@ -1213,16 +1214,5 @@ mod ereport { pub(super) pmbus_status: Option, } - #[derive(Copy, Clone, Default, microcbor::Encode)] - pub(super) struct PmbusStatus { - pub(super) word: Option, - pub(super) input: Option, - pub(super) iout: Option, - pub(super) vout: Option, - pub(super) temp: Option, - pub(super) cml: Option, - pub(super) mfr: Option, - } - pub(super) type Ereport = task_packrat_api::Ereport; } diff --git a/lib/ereports/Cargo.toml b/lib/ereports/Cargo.toml new file mode 100644 index 0000000000..e416c2a5d6 --- /dev/null +++ b/lib/ereports/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "ereports" +version = "0.1.0" +edition = "2024" + +[dependencies] +microcbor = { path = "../microcbor" } +fixedstr = { path = "../fixedstr", features = ["microcbor"] } +drv-cpu-power-state = { path = "../../drv/cpu-power-state", features = ["microcbor"] } +drv-i2c-devices = { path = "../../drv/i2c-devices" } + 
+[lints] +workspace = true + +# This section is here to discourage RLS/rust-analyzer from doing test builds, +# since test builds don't work for cross compilation, and this crate depends +# on `userlib` (transitively, via `drv-i2c-devices`), which won't build on non-ARM +# targets. +[lib] +test = false +doctest = false +bench = false diff --git a/lib/ereports/src/cpu.rs b/lib/ereports/src/cpu.rs new file mode 100644 index 0000000000..a4c6f44675 --- /dev/null +++ b/lib/ereports/src/cpu.rs @@ -0,0 +1,65 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Common ereport types from the `hw.cpu.*` class hierarchy. + +use fixedstr::FixedString; +use microcbor::{Encode, EncodeFields}; + +/// An ereport representing an AMD CPU's `THERMTRIP` assertion. +#[derive(Clone, Encode)] +#[ereport(class = "hw.cpu.amd.thermtrip", version = 0)] +pub struct Thermtrip { + #[cbor(flatten)] + pub cpu: &'static HostCpuRefdes, + pub state: crate::pwr::CurrentState, +} + +/// An ereport representing an AMD CPU's `SMERR_L` assertion. +#[derive(Clone, Encode)] +#[ereport(class = "hw.cpu.amd.smerr", version = 0)] +pub struct Smerr { + #[cbor(flatten)] + pub cpu: &'static HostCpuRefdes, + pub state: crate::pwr::CurrentState, +} + +/// An ereport representing an unsupported AMD CPU.
+#[derive(Clone, Encode)] +#[ereport(class = "hw.cpu.amd.unsup", version = 0)] +pub struct UnsupportedCpu { + #[cbor(flatten)] + pub cpu: &'static HostCpuRefdes, + pub coretype: CpuTypeBits, + pub rev: CpuTypeBits, +} + +/// An ereport representing a non-present (missing) host CPU. +#[derive(Clone, Encode)] +#[ereport(class = "hw.cpu.missing", version = 0)] +pub struct CpuMissing { + #[cbor(flatten)] + pub cpu: &'static HostCpuRefdes, +} + +#[derive(Clone, Encode)] +pub struct CpuTypeBits { + pub bits: [bool; BITS], + pub ok: bool, +} + +#[derive(Clone, EncodeFields)] +pub struct HostCpuRefdes { + /// On both Gimlet and Cosmo, the host CPU's refdes is `P0`. + pub refdes: FixedString<2>, + /// As the host CPU's `control-plane-agent` device ID is different from its + /// refdes, we must include both in the ereport. + /// + /// On Gimlet, this is `sp3-host-cpu` and on Cosmo, it is `sp5-host-cpu`. + // + // TODO(eliza): It would be cool if we could get this from the same value as + // where `control-plane-agent` gets it from...but in practice that's + // annoying. + pub dev_id: FixedString<12>, +} diff --git a/lib/ereports/src/lib.rs b/lib/ereports/src/lib.rs new file mode 100644 index 0000000000..dd134eb56f --- /dev/null +++ b/lib/ereports/src/lib.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Ereport message definitions shared between multiple tasks. + +#![no_std] + +pub mod cpu; +pub mod pwr; diff --git a/lib/ereports/src/pwr.rs b/lib/ereports/src/pwr.rs new file mode 100644 index 0000000000..d0614515c7 --- /dev/null +++ b/lib/ereports/src/pwr.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//!
Common ereport types from the `hw.pwr.*` class hierarchy. + +use fixedstr::FixedStr; +use microcbor::{Encode, StaticCborLen}; + +/// An ereport representing a PMBus alert. +#[derive(Clone, Encode)] +#[ereport(class = "hw.pwr.pmbus.alert", version = 0)] +pub struct PmbusAlert { + pub refdes: FixedStr<'static, REFDES_LEN>, + pub rail: R, + pub time: u64, + pub pwr_good: Option, + pub pmbus_status: PmbusStatus, +} + +/// An ereport representing a failure to apply the BMR491 firmware mitigation. +#[derive(Clone, Encode)] +#[ereport(class = "hw.pwr.bmr491.mitfail", version = 0)] +pub struct Bmr491MitigationFailure { + pub refdes: FixedStr<'static, REFDES_LEN>, + pub failures: u32, + pub last_cause: drv_i2c_devices::bmr491::MitigationFailureKind, + pub succeeded: bool, +} + +/// PMBus status registers. +#[derive(Copy, Clone, Default, Encode)] +pub struct PmbusStatus { + pub word: Option, + pub input: Option, + pub iout: Option, + pub vout: Option, + pub temp: Option, + pub cml: Option, + pub mfr: Option, +} + +/// Represents the current power state when an event occurred, and the Hubris +/// timestamp at which the system transitioned to that state. +/// +/// When the event represented by an ereport is one which transitions the system +/// to a different state, the `CurrentState` represents the *prior* state, i.e. +/// the one at the time the event occurred, *not* the new state the system will +/// transition to. +#[derive(Copy, Clone, Encode)] +pub struct CurrentState { + /// The current CPU power state. + pub cur: drv_cpu_power_state::PowerState, + /// The Hubris uptime, in milliseconds, at which the transition to this + /// state occurred. + pub since_ms: u64, +}