From 3f9b9123f788567fee819fc708dac2f7f055e137 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Wed, 4 Jun 2025 10:30:16 +0200 Subject: [PATCH 01/14] New APIs in the InvocationStatusTable and JournalTable to accommodate storing "archived" journals and statuses. These APIs will be used by the following commits. --- .../src/invocation_status_table/mod.rs | 65 +++++++++++ crates/storage-api/src/journal_table/mod.rs | 3 +- .../storage-api/src/journal_table_v2/mod.rs | 101 +++++++++++++++++- crates/types/src/identifiers.rs | 31 ------ 4 files changed, 164 insertions(+), 36 deletions(-) diff --git a/crates/storage-api/src/invocation_status_table/mod.rs b/crates/storage-api/src/invocation_status_table/mod.rs index 20fe6ee2a2..f4752f0c6e 100644 --- a/crates/storage-api/src/invocation_status_table/mod.rs +++ b/crates/storage-api/src/invocation_status_table/mod.rs @@ -364,6 +364,31 @@ impl InvocationStatus { } } + #[inline] + pub fn get_epoch(&self) -> InvocationEpoch { + match self { + InvocationStatus::Scheduled(_) | InvocationStatus::Inboxed(_) => 0, + InvocationStatus::Invoked(metadata) | InvocationStatus::Suspended { metadata, .. } => { + metadata.current_invocation_epoch + } + InvocationStatus::Completed(completed) => completed.invocation_epoch, + InvocationStatus::Free => 0, + } + } + + #[inline] + pub fn get_pinned_deployment(&self) -> Option<&PinnedDeployment> { + match self { + InvocationStatus::Scheduled(_) + | InvocationStatus::Inboxed(_) + | InvocationStatus::Free => None, + InvocationStatus::Invoked(metadata) | InvocationStatus::Suspended { metadata, .. 
} => { + metadata.pinned_deployment.as_ref() + } + InvocationStatus::Completed(completed) => completed.pinned_deployment.as_ref(), + } + } + #[inline] pub fn get_timestamps_mut(&mut self) -> Option<&mut StatusTimestamps> { match self { @@ -693,8 +718,11 @@ pub struct CompletedInvocation { pub completion_retention_duration: Duration, pub journal_retention_duration: Duration, + pub invocation_epoch: InvocationEpoch, + pub journal_metadata: JournalMetadata, pub pinned_deployment: Option, + pub completion_range_epoch_map: CompletionRangeEpochMap, } #[derive(PartialEq, Eq)] @@ -725,12 +753,14 @@ impl CompletedInvocation { completion_retention_duration: in_flight_invocation_metadata .completion_retention_duration, journal_retention_duration: in_flight_invocation_metadata.journal_retention_duration, + invocation_epoch: in_flight_invocation_metadata.current_invocation_epoch, journal_metadata: if journal_retention_policy == JournalRetentionPolicy::Retain { in_flight_invocation_metadata.journal_metadata } else { JournalMetadata::empty() }, pinned_deployment: in_flight_invocation_metadata.pinned_deployment, + completion_range_epoch_map: in_flight_invocation_metadata.completion_range_epoch_map, } } @@ -753,21 +783,40 @@ pub struct InvokedInvocationStatusLite { } pub trait ReadOnlyInvocationStatusTable { + /// Gets the latest invocation status fn get_invocation_status( &mut self, invocation_id: &InvocationId, ) -> impl Future> + Send; + /// Gets the latest epoch for the given invocation id + fn get_latest_epoch_for_invocation_status( + &mut self, + invocation_id: &InvocationId, + ) -> impl Future>> + Send; + + /// Epoch can be the latest as well. + fn get_invocation_status_for_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + ) -> impl Future> + Send; + fn all_invoked_invocations( &mut self, ) -> Result> + Send>; + /// Returns all the invocation statuses, including the archived ones. 
fn all_invocation_statuses( &self, range: RangeInclusive, ) -> Result> + Send>; } +/// ## Latest and archived invocations +/// +/// Current invocations and archived invocations are stored in separate key ranges. +/// Archived invocations are immutable and cannot outlive the "latest" invocation. pub trait InvocationStatusTable: ReadOnlyInvocationStatusTable { fn put_invocation_status( &mut self, @@ -775,9 +824,21 @@ pub trait InvocationStatusTable: ReadOnlyInvocationStatusTable { status: &InvocationStatus, ) -> impl Future> + Send; + /// Archive the invocation status for the given invocation epoch. + /// + /// This won't affect the latest/current status. + fn archive_invocation_status_to_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + status: &InvocationStatus, + ) -> impl Future> + Send; + + /// Delete the invocation status. If no epoch is provided, remove the latest/current. fn delete_invocation_status( &mut self, invocation_id: &InvocationId, + invocation_epoch: Option, ) -> impl Future> + Send; } @@ -841,8 +902,10 @@ mod test_util { response_result: ResponseResult::Success(Bytes::from_static(b"123")), completion_retention_duration: Duration::from_secs(60 * 60), journal_retention_duration: Duration::ZERO, + invocation_epoch: 0, journal_metadata: JournalMetadata::empty(), pinned_deployment: None, + completion_range_epoch_map: Default::default(), } } @@ -864,6 +927,8 @@ mod test_util { journal_metadata: JournalMetadata::empty(), journal_retention_duration: Duration::ZERO, pinned_deployment: None, + invocation_epoch: 0, + completion_range_epoch_map: Default::default(), } } } diff --git a/crates/storage-api/src/journal_table/mod.rs b/crates/storage-api/src/journal_table/mod.rs index 74f364b807..ee6dd2de68 100644 --- a/crates/storage-api/src/journal_table/mod.rs +++ b/crates/storage-api/src/journal_table/mod.rs @@ -9,8 +9,9 @@ // by the Apache License, Version 2.0. 
use crate::Result; +use crate::journal_table_v2::JournalEntryId; use futures_util::Stream; -use restate_types::identifiers::{EntryIndex, InvocationId, JournalEntryId, PartitionKey}; +use restate_types::identifiers::{EntryIndex, InvocationId, PartitionKey}; use restate_types::journal::enriched::EnrichedRawEntry; use restate_types::journal::{CompletionResult, EntryType}; use std::future::Future; diff --git a/crates/storage-api/src/journal_table_v2/mod.rs b/crates/storage-api/src/journal_table_v2/mod.rs index e57053b8ee..b7d8869905 100644 --- a/crates/storage-api/src/journal_table_v2/mod.rs +++ b/crates/storage-api/src/journal_table_v2/mod.rs @@ -10,26 +10,84 @@ use crate::Result; use futures_util::Stream; -use restate_types::identifiers::{EntryIndex, InvocationId, JournalEntryId, PartitionKey}; +use restate_types::identifiers::{EntryIndex, InvocationId, PartitionKey, WithInvocationId}; +use restate_types::invocation::InvocationEpoch; use restate_types::journal_v2::raw::{RawCommand, RawEntry}; use restate_types::journal_v2::{CompletionId, NotificationId}; use std::collections::HashMap; use std::future::Future; use std::ops::RangeInclusive; +#[derive(Eq, Hash, PartialEq, Clone, Copy, Debug)] +pub struct JournalEntryId { + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + journal_index: EntryIndex, +} + +impl JournalEntryId { + pub const fn from_parts( + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + journal_index: EntryIndex, + ) -> Self { + Self { + invocation_id, + invocation_epoch, + journal_index, + } + } + + pub fn invocation_epoch(&self) -> InvocationEpoch { + self.invocation_epoch + } + + pub fn journal_index(&self) -> EntryIndex { + self.journal_index + } +} + +impl From<(InvocationId, InvocationEpoch, EntryIndex)> for JournalEntryId { + fn from(value: (InvocationId, InvocationEpoch, EntryIndex)) -> Self { + Self::from_parts(value.0, value.1, value.2) + } +} + +impl WithInvocationId for JournalEntryId { + fn 
invocation_id(&self) -> InvocationId { + self.invocation_id + } +} + pub trait ReadOnlyJournalTable { + /// Get an entry from the latest/current journal. fn get_journal_entry( &mut self, invocation_id: InvocationId, - index: u32, + entry_index: EntryIndex, + ) -> impl Future>> + Send; + + fn get_journal_entry_for_epoch( + &mut self, + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + entry_index: EntryIndex, ) -> impl Future>> + Send; + /// Get the latest/current journal. fn get_journal( &mut self, invocation_id: InvocationId, length: EntryIndex, ) -> Result> + Send>; + /// Returns whether the latest/current journal contains at least one entry. + fn has_journal( + &mut self, + invocation_id: &InvocationId, + ) -> impl Future> + Send; + + /// Returns all the journals, including the archived ones. fn all_journals( &self, range: RangeInclusive, @@ -47,20 +105,55 @@ pub trait ReadOnlyJournalTable { ) -> impl Future>> + Send; } +/// ## Latest and archived journals +/// +/// Current journals and archived journals are stored in separate key ranges. +/// Archived journals are immutable and cannot outlive the "latest" journal. pub trait JournalTable: ReadOnlyJournalTable { - /// Related completion ids to this RawEntry, used to build the internal index + /// Related completion ids to this RawEntry, used to build the internal index. fn put_journal_entry( &mut self, invocation_id: InvocationId, + current_invocation_epoch: InvocationEpoch, index: u32, entry: &RawEntry, related_completion_ids: &[CompletionId], ) -> impl Future> + Send; - /// When length is available, it is suggested to provide it as it makes the delete more efficient. + /// Archive the current journal to the given invocation epoch. + /// + /// This won't affect the latest/current journal. + fn archive_journal_to_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + length: EntryIndex, + ) -> impl Future> + Send; + + /// Update epoch numbers for the given journal range.
+ /// + /// This will affect only the latest/current journal. + fn update_current_journal_epoch( + &mut self, + invocation_id: &InvocationId, + new_epoch: InvocationEpoch, + length: EntryIndex, + ) -> impl Future> + Send; + + /// Delete the journal. If no epoch is provided, remove the latest/current. fn delete_journal( &mut self, invocation_id: InvocationId, + invocation_epoch: Option, length: EntryIndex, ) -> impl Future> + Send; + + /// Delete the given journal range in the latest/current journal. + fn delete_journal_range( + &mut self, + invocation_id: InvocationId, + from_included: EntryIndex, + to_excluded: EntryIndex, + notification_ids_to_cleanup: &[NotificationId], + ) -> impl Future> + Send; } diff --git a/crates/types/src/identifiers.rs b/crates/types/src/identifiers.rs index de5064f90a..316bd23d91 100644 --- a/crates/types/src/identifiers.rs +++ b/crates/types/src/identifiers.rs @@ -670,37 +670,6 @@ fn encode_invocation_id( buf } -#[derive(Eq, Hash, PartialEq, Clone, Copy, Debug)] -pub struct JournalEntryId { - invocation_id: InvocationId, - journal_index: EntryIndex, -} - -impl JournalEntryId { - pub const fn from_parts(invocation_id: InvocationId, journal_index: EntryIndex) -> Self { - Self { - invocation_id, - journal_index, - } - } - - pub fn journal_index(&self) -> EntryIndex { - self.journal_index - } -} - -impl From<(InvocationId, EntryIndex)> for JournalEntryId { - fn from(value: (InvocationId, EntryIndex)) -> Self { - Self::from_parts(value.0, value.1) - } -} - -impl WithInvocationId for JournalEntryId { - fn invocation_id(&self) -> InvocationId { - self.invocation_id - } -} - #[derive(Debug, Clone, PartialEq, serde_with::SerializeDisplay, serde_with::DeserializeFromStr)] pub struct LambdaARN { partition: ByteString, From 04c217f63fbbdf962662fbdcf1699597a2c06fd6 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 12:58:16 +0200 Subject: [PATCH 02/14] Expose invocation epoch in SQL --- 
crates/storage-query-datafusion/src/context.rs | 1 + .../storage-query-datafusion/src/invocation_status/row.rs | 1 + .../src/invocation_status/schema.rs | 3 +++ crates/storage-query-datafusion/src/journal/row.rs | 5 ++++- crates/storage-query-datafusion/src/journal/schema.rs | 3 +++ crates/storage-query-datafusion/src/journal/table.rs | 6 ++++-- crates/storage-query-datafusion/src/table_docs.rs | 3 +++ 7 files changed, 19 insertions(+), 3 deletions(-) diff --git a/crates/storage-query-datafusion/src/context.rs b/crates/storage-query-datafusion/src/context.rs index 7a47d2a59e..e4bb56ac99 100644 --- a/crates/storage-query-datafusion/src/context.rs +++ b/crates/storage-query-datafusion/src/context.rs @@ -44,6 +44,7 @@ use crate::{analyzer, physical_optimizer}; const SYS_INVOCATION_VIEW: &str = "CREATE VIEW sys_invocation as SELECT ss.id, + ss.epoch, ss.target, ss.target_service_name, ss.target_service_key, diff --git a/crates/storage-query-datafusion/src/invocation_status/row.rs b/crates/storage-query-datafusion/src/invocation_status/row.rs index 112d3c2e4d..8680e0b425 100644 --- a/crates/storage-query-datafusion/src/invocation_status/row.rs +++ b/crates/storage-query-datafusion/src/invocation_status/row.rs @@ -28,6 +28,7 @@ pub(crate) fn append_invocation_status_row( let mut row = builder.row(); row.partition_key(invocation_id.partition_key()); + row.epoch(invocation_status.get_epoch()); if let Some(invocation_target) = invocation_status.invocation_target() { row.target_service_name(invocation_target.service_name()); if let Some(key) = invocation_target.key() { diff --git a/crates/storage-query-datafusion/src/invocation_status/schema.rs b/crates/storage-query-datafusion/src/invocation_status/schema.rs index 5f2f6d4a31..523c3ad140 100644 --- a/crates/storage-query-datafusion/src/invocation_status/schema.rs +++ b/crates/storage-query-datafusion/src/invocation_status/schema.rs @@ -21,6 +21,9 @@ define_table!(sys_invocation_status( /// [Invocation 
ID](/operate/invocation#invocation-identifier). id: DataType::LargeUtf8, + /// Invocation epoch. + epoch: DataType::UInt32, + /// Either `inboxed` or `scheduled` or `invoked` or `suspended` or `completed` status: DataType::LargeUtf8, diff --git a/crates/storage-query-datafusion/src/journal/row.rs b/crates/storage-query-datafusion/src/journal/row.rs index 7ed0552aaa..7d074cc2f2 100644 --- a/crates/storage-query-datafusion/src/journal/row.rs +++ b/crates/storage-query-datafusion/src/journal/row.rs @@ -14,7 +14,8 @@ use crate::table_util::format_using; use restate_service_protocol::codec::ProtobufRawEntryCodec; use restate_service_protocol_v4::entry_codec::ServiceProtocolV4Codec; use restate_storage_api::journal_table::JournalEntry; -use restate_types::identifiers::{JournalEntryId, WithInvocationId, WithPartitionKey}; +use restate_storage_api::journal_table_v2::JournalEntryId; +use restate_types::identifiers::{WithInvocationId, WithPartitionKey}; use restate_types::journal::Entry; use restate_types::journal::enriched::EnrichedEntryHeader; use restate_types::journal::{CompletePromiseEntry, GetPromiseEntry, PeekPromiseEntry}; @@ -37,6 +38,7 @@ pub(crate) fn append_journal_row( if row.is_id_defined() { row.id(format_using(output, &journal_entry_id.invocation_id())); } + row.epoch(journal_entry_id.invocation_epoch()); row.index(journal_entry_id.journal_index()); @@ -153,6 +155,7 @@ pub(crate) fn append_journal_row_v2( if row.is_id_defined() { row.id(format_using(output, &journal_entry_id.invocation_id())); } + row.epoch(journal_entry_id.invocation_epoch()); row.index(journal_entry_id.journal_index()); if row.is_entry_type_defined() { diff --git a/crates/storage-query-datafusion/src/journal/schema.rs b/crates/storage-query-datafusion/src/journal/schema.rs index 61f6c42665..0b4fd67520 100644 --- a/crates/storage-query-datafusion/src/journal/schema.rs +++ b/crates/storage-query-datafusion/src/journal/schema.rs @@ -21,6 +21,9 @@ define_table!(sys_journal ( /// [Invocation 
ID](/operate/invocation#invocation-identifier). id: DataType::LargeUtf8, + /// Invocation epoch + epoch: DataType::UInt32, + /// The index of this journal entry. index: DataType::UInt32, diff --git a/crates/storage-query-datafusion/src/journal/table.rs b/crates/storage-query-datafusion/src/journal/table.rs index bccef0bc5c..c00027c285 100644 --- a/crates/storage-query-datafusion/src/journal/table.rs +++ b/crates/storage-query-datafusion/src/journal/table.rs @@ -9,7 +9,9 @@ // by the Apache License, Version 2.0. use futures::Stream; -use restate_storage_api::journal_table_v2::ReadOnlyJournalTable as ReadOnlyJournalTableV2; +use restate_storage_api::journal_table_v2::{ + JournalEntryId, ReadOnlyJournalTable as ReadOnlyJournalTableV2, +}; use std::fmt::Debug; use std::ops::RangeInclusive; use std::sync::Arc; @@ -25,7 +27,7 @@ use crate::table_providers::{PartitionedTableProvider, ScanPartition}; use restate_partition_store::{PartitionStore, PartitionStoreManager}; use restate_storage_api::StorageError; use restate_storage_api::journal_table::{JournalEntry, ReadOnlyJournalTable}; -use restate_types::identifiers::{JournalEntryId, PartitionKey}; +use restate_types::identifiers::PartitionKey; use restate_types::journal_v2::raw::RawEntry; const NAME: &str = "sys_journal"; diff --git a/crates/storage-query-datafusion/src/table_docs.rs b/crates/storage-query-datafusion/src/table_docs.rs index d20cc0debd..f81805b6c4 100644 --- a/crates/storage-query-datafusion/src/table_docs.rs +++ b/crates/storage-query-datafusion/src/table_docs.rs @@ -103,6 +103,9 @@ pub fn sys_invocation_table_docs() -> OwnedTableDocs { let columns = vec![ sys_invocation_status.remove("id").expect("id should exist"), + sys_invocation_status + .remove("epoch") + .expect("epoch should exist"), sys_invocation_status .remove("target") .expect("target should exist"), From 0608d2295744778e5e633a8424b0bc385a15c1cd Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 12:59:37 +0200 Subject: [PATCH 
03/14] Interface for the restart feature, plus add functionality in purge invocation/purge journal to selectively remove an invocation epoch --- crates/types/src/invocation/mod.rs | 59 +++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/crates/types/src/invocation/mod.rs b/crates/types/src/invocation/mod.rs index b3d3eb515c..87f3695870 100644 --- a/crates/types/src/invocation/mod.rs +++ b/crates/types/src/invocation/mod.rs @@ -970,12 +970,16 @@ pub enum TerminationFlavor { Cancel = 1, } -/// Message to purge an invocation. +/// Request to purge an invocation. #[derive(Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub struct PurgeInvocationRequest { pub invocation_id: InvocationId, #[serde(default)] pub response_sink: Option, + /// When epoch is the current/latest epoch, all the other epochs will be cleaned up as well. + #[serde(default, skip_serializing_if = "num_traits::Zero::is_zero")] + #[bilrost(6)] + pub invocation_epoch: InvocationEpoch, } // A hack to allow spancontext to be serialized. @@ -1142,6 +1146,59 @@ impl WithInvocationId for NotifySignalRequest { /// The invocation epoch represents the restarts count of the invocation, as seen from the Partition processor. pub type InvocationEpoch = u32; +/// Restart invocation command. See [restart::Request] +pub mod restart { + use super::*; + + /// Restart an invocation. + /// + /// This will restart the invocation, given its input is available. + #[derive(Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub struct Request { + pub invocation_id: InvocationId, + + /// What to do if the invocation is still running. + pub if_running: IfRunning, + + /// If set, it will override the configured completion_retention/journal_retention when the invocation was executed the first time. 
+ /// If neither completion_retention/journal_retention nor this previous_attempt_retention is configured, then the previous attempt won't be retained at all. + /// + /// To retain the previous attempt, the new attempt will take the invocation id of the previous attempt, the one used to trigger this reset, + /// and the old invocation will take a new randomly generated invocation id. + pub previous_attempt_retention: Option, + + /// What to do in case the invocation was a Workflow run (workflow service and workflow handler type) + pub apply_to_workflow_run: ApplyToWorkflowRun, + + /// Where to send the response for this command + pub response_sink: Option, + } + + impl WithInvocationId for Request { + fn invocation_id(&self) -> InvocationId { + self.invocation_id + } + } + + #[derive(Default, Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub enum IfRunning { + /// Kill the invocation, then restart it. + #[default] + Kill, + /// Fail the Restart command if the invocation is still running + Fail, + } + + #[derive(Default, Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub enum ApplyToWorkflowRun { + Nothing, + ClearOnlyPromises, + ClearOnlyState, + #[default] + ClearAllPromisesAndState, + } +} + mod serde_hacks { //! Module where we hide all the hacks to make back-compat working!
From 5c6855bccd8b00ebab76e8ff0a02c1b2842308d1 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:00:02 +0200 Subject: [PATCH 04/14] Implementation of the Restart feature --- .../state_machine/lifecycle/archive.rs | 82 ++ .../state_machine/lifecycle/restart.rs | 1059 +++++++++++++++++ 2 files changed, 1141 insertions(+) create mode 100644 crates/worker/src/partition/state_machine/lifecycle/archive.rs create mode 100644 crates/worker/src/partition/state_machine/lifecycle/restart.rs diff --git a/crates/worker/src/partition/state_machine/lifecycle/archive.rs b/crates/worker/src/partition/state_machine/lifecycle/archive.rs new file mode 100644 index 0000000000..5860740db8 --- /dev/null +++ b/crates/worker/src/partition/state_machine/lifecycle/archive.rs @@ -0,0 +1,82 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use crate::debug_if_leader; +use crate::partition::state_machine::{CommandHandler, Error, StateMachineApplyContext}; +use restate_storage_api::invocation_status_table::{ + CompletedInvocation, InvocationStatus, InvocationStatusTable, JournalMetadata, +}; +use restate_storage_api::journal_table_v2::JournalTable; +use restate_types::identifiers::InvocationId; +use std::time::Duration; + +/// This command is used in restart to archive an invocation. 
+pub struct ArchiveInvocationCommand { + pub invocation_id: InvocationId, + pub completed_invocation: CompletedInvocation, + pub previous_attempt_retention_override: Option, +} + +impl<'ctx, 's: 'ctx, S> CommandHandler<&'ctx mut StateMachineApplyContext<'s, S>> + for ArchiveInvocationCommand +where + S: InvocationStatusTable + JournalTable, +{ + async fn apply(self, ctx: &'ctx mut StateMachineApplyContext<'s, S>) -> Result<(), Error> { + let ArchiveInvocationCommand { + invocation_id, + completed_invocation: mut completed_invocation_status, + previous_attempt_retention_override, + } = self; + + if let Some(previous_attempt_retention_override) = previous_attempt_retention_override { + completed_invocation_status.completion_retention_duration = + previous_attempt_retention_override; + completed_invocation_status.journal_retention_duration = + previous_attempt_retention_override; + } + + let should_retain_status = !completed_invocation_status + .completion_retention_duration + .is_zero(); + let should_retain_journal = should_retain_status + && !completed_invocation_status + .journal_retention_duration + .is_zero(); + + if !should_retain_journal { + completed_invocation_status.journal_metadata = JournalMetadata::empty(); + } + + completed_invocation_status.timestamps.update(); + + let journal_length = completed_invocation_status.journal_metadata.length; + let invocation_epoch = completed_invocation_status.invocation_epoch; + + if should_retain_status { + debug_if_leader!(ctx.is_leader, "Archiving invocation metadata"); + ctx.storage + .archive_invocation_status_to_epoch( + &invocation_id, + completed_invocation_status.invocation_epoch, + &InvocationStatus::Completed(completed_invocation_status), + ) + .await?; + } + if should_retain_journal { + debug_if_leader!(ctx.is_leader, "Archiving invocation journal"); + ctx.storage + .archive_journal_to_epoch(&invocation_id, invocation_epoch, journal_length) + .await?; + } + + Ok(()) + } +} diff --git 
a/crates/worker/src/partition/state_machine/lifecycle/restart.rs b/crates/worker/src/partition/state_machine/lifecycle/restart.rs new file mode 100644 index 0000000000..c58b0505ff --- /dev/null +++ b/crates/worker/src/partition/state_machine/lifecycle/restart.rs @@ -0,0 +1,1059 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +use crate::debug_if_leader; +use crate::partition::state_machine::entries::write_entry::WriteJournalEntryCommand; +use crate::partition::state_machine::lifecycle::ArchiveInvocationCommand; +use crate::partition::state_machine::{ + Action, CommandHandler, Error, StateMachineApplyContext, should_use_journal_table_v2, +}; +use restate_invoker_api::InvokeInputJournal; +use restate_service_protocol_v4::entry_codec::ServiceProtocolV4Codec; +use restate_storage_api::fsm_table::FsmTable; +use restate_storage_api::invocation_status_table::{ + CompletedInvocation, InFlightInvocationMetadata, InvocationStatus, InvocationStatusTable, + JournalRetentionPolicy, StatusTimestamps, +}; +use restate_storage_api::journal_table as journal_table_v1; +use restate_storage_api::journal_table_v2::{JournalTable, ReadOnlyJournalTable}; +use restate_storage_api::outbox_table::OutboxTable; +use restate_storage_api::promise_table::PromiseTable; +use restate_storage_api::state_table::StateTable; +use restate_types::errors::RESTARTED_INVOCATION_ERROR; +use restate_types::identifiers::InvocationId; +use restate_types::invocation::client::RestartInvocationResponse; +use restate_types::invocation::restart::{ApplyToWorkflowRun, IfRunning}; +use restate_types::invocation::{ + IngressInvocationResponseSink, InvocationEpoch, 
InvocationMutationResponseSink, + InvocationTargetType, ResponseResult, WorkflowHandlerType, +}; +use restate_types::journal_v2; +use restate_types::journal_v2::{ + CommandType, Entry, EntryIndex, EntryMetadata, OutputCommand, OutputResult, +}; +use std::time::Duration; +use tracing::trace; + +pub struct OnRestartInvocationCommand { + pub invocation_id: InvocationId, + pub if_running: IfRunning, + pub previous_attempt_retention: Option, + pub apply_to_workflow_run: ApplyToWorkflowRun, + pub response_sink: Option, +} + +impl<'ctx, 's: 'ctx, S> CommandHandler<&'ctx mut StateMachineApplyContext<'s, S>> + for OnRestartInvocationCommand +where + S: JournalTable + + InvocationStatusTable + + StateTable + + PromiseTable + + OutboxTable + + FsmTable + + journal_table_v1::ReadOnlyJournalTable, +{ + async fn apply(self, ctx: &'ctx mut StateMachineApplyContext<'s, S>) -> Result<(), Error> { + let OnRestartInvocationCommand { + invocation_id, + if_running, + previous_attempt_retention, + apply_to_workflow_run, + response_sink, + } = self; + + let invocation_status = ctx.get_invocation_status(&invocation_id).await?; + if invocation_status == InvocationStatus::Free { + trace!("Received restart command for unknown invocation with id '{invocation_id}'."); + ctx.reply_to_restart_invocation(response_sink, RestartInvocationResponse::NotFound); + return Ok(()); + } + + if !should_use_journal_table_v2(&invocation_status) { + trace!( + "Received restart command for invocation using the old journal table, this is unsupported." + ); + ctx.reply_to_restart_invocation(response_sink, RestartInvocationResponse::Unsupported); + return Ok(()); + } + + let (mut invocation, input_entry_index, original_journal_length) = match invocation_status { + InvocationStatus::Completed(completed_invocation) => { + let Some(input_entry_index) = ctx + .find_input_entry(invocation_id, completed_invocation.journal_metadata.length) + .await? 
+ else { + trace!( + "Received restart command for a completed invocation, for which the journal/input entry was not retained." + ); + ctx.reply_to_restart_invocation( + response_sink, + RestartInvocationResponse::MissingInput, + ); + return Ok(()); + }; + + let original_journal_length = completed_invocation.journal_metadata.length; + ( + completed_invocation, + input_entry_index, + original_journal_length, + ) + } + InvocationStatus::Invoked(mut metadata) + | InvocationStatus::Suspended { mut metadata, .. } => { + if if_running == IfRunning::Fail { + trace!( + "Received restart command for invocation that is not in completed state.", + ); + ctx.reply_to_restart_invocation( + response_sink, + RestartInvocationResponse::StillRunning, + ); + return Ok(()); + } + + let Some(input_entry_index) = ctx + .find_input_entry(invocation_id, metadata.journal_metadata.length) + .await? + else { + trace!( + "Received restart command for a completed invocation, for which the journal/input entry was not retained." + ); + ctx.reply_to_restart_invocation( + response_sink, + RestartInvocationResponse::MissingInput, + ); + return Ok(()); + }; + + // --- Kill the running invocation. 
+ + // Kill children first + ctx.kill_child_invocations( + &invocation_id, + metadata.journal_metadata.length, + &metadata, + ) + .await?; + + // Send abort invocation to invoker + ctx.do_send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); + + // Write output entry + let response_result = ResponseResult::Failure(RESTARTED_INVOCATION_ERROR); + WriteJournalEntryCommand { + invocation_id: self.invocation_id, + journal_metadata: &mut metadata.journal_metadata, + invocation_epoch: metadata.current_invocation_epoch, + entry: Entry::from(OutputCommand { + result: OutputResult::Failure(RESTARTED_INVOCATION_ERROR.into()), + name: Default::default(), + }) + .encode::(), + related_completion_ids: Default::default(), + } + .apply(ctx) + .await?; + + // Send responses out + ctx.send_response_to_sinks( + metadata.response_sinks.clone(), + response_result.clone(), + Some(invocation_id), + None, + Some(&metadata.invocation_target), + ) + .await?; + + // Notify invocation result + ctx.notify_invocation_result( + invocation_id, + metadata.invocation_target.clone(), + metadata.journal_metadata.span_context.clone(), + // SAFETY: We use this field to send back the notification to ingress, and not as part of the PP deterministic logic. 
+ unsafe { metadata.timestamps.creation_time() }, + Err(( + RESTARTED_INVOCATION_ERROR.code(), + RESTARTED_INVOCATION_ERROR.message().to_string(), + )), + ); + + // Prepare the completed status + let original_journal_length = metadata.journal_metadata.length; + let journal_retention_policy = if metadata.journal_retention_duration.is_zero() { + JournalRetentionPolicy::Drop + } else { + JournalRetentionPolicy::Retain + }; + let completed_invocation = CompletedInvocation::from_in_flight_invocation_metadata( + metadata, + journal_retention_policy, + response_result, + ); + + ( + completed_invocation, + input_entry_index, + original_journal_length, + ) + } + is @ InvocationStatus::Scheduled(_) | is @ InvocationStatus::Inboxed(_) => { + trace!( + "Received restart command for invocation that didn't start yet, current status {:?}. The command will be ignored.", + is.discriminant().unwrap() + ); + ctx.reply_to_restart_invocation( + response_sink, + RestartInvocationResponse::NotStarted, + ); + return Ok(()); + } + InvocationStatus::Free => unreachable!(), + }; + + let new_epoch = invocation.invocation_epoch + 1; + + // If it's workflow run, we need some special logic + if invocation.invocation_target.invocation_target_ty() + == InvocationTargetType::Workflow(WorkflowHandlerType::Workflow) + { + let workflow_id = invocation.invocation_target.as_keyed_service_id().unwrap(); + match apply_to_workflow_run { + ApplyToWorkflowRun::Nothing => {} + ApplyToWorkflowRun::ClearOnlyPromises => { + ctx.do_clear_all_promises(workflow_id).await?; + } + ApplyToWorkflowRun::ClearOnlyState => { + ctx.do_clear_all_state(workflow_id).await?; + } + ApplyToWorkflowRun::ClearAllPromisesAndState => { + ctx.do_clear_all_promises(workflow_id.clone()).await?; + ctx.do_clear_all_state(workflow_id).await?; + } + }; + } + + // Archive the previous invocation + ArchiveInvocationCommand { + invocation_id, + completed_invocation: invocation.clone(), + previous_attempt_retention_override: 
previous_attempt_retention, + } + .apply(ctx) + .await?; + + // Scan the journal to see where the input entry is: should always be first entry, but with journal v2 in future we might have entries interleaving. + // Drop the suffix of the journal after the input entry. + let delete_journal_from = input_entry_index + 1; + let delete_journal_to = original_journal_length; + debug_if_leader!( + ctx.is_leader, + "Deleting journal range {delete_journal_from} to {delete_journal_to}", + ); + let notifications: Vec<_> = ctx + .storage + .get_notifications_index(invocation_id) + .await? + .into_keys() + .collect(); + ctx.storage + .delete_journal_range( + invocation_id, + delete_journal_from, + delete_journal_to, + ¬ifications, + ) + .await?; + ctx.storage + .update_current_journal_epoch(&invocation_id, new_epoch, delete_journal_from) + .await?; + + // --- Let's prepare the InFlightInvocationMetadata + + // Reset length and commands + invocation.journal_metadata.length = input_entry_index + 1; + invocation.journal_metadata.commands = 1 /* Only the input entry */; + + invocation + .completion_range_epoch_map + .add_trim_point(0, new_epoch); + + let in_flight_invocation_metadata = InFlightInvocationMetadata { + invocation_target: invocation.invocation_target, + source: invocation.source, + execution_time: invocation.execution_time, + idempotency_key: invocation.idempotency_key, + current_invocation_epoch: new_epoch, + completion_retention_duration: invocation.completion_retention_duration, + journal_retention_duration: invocation.journal_retention_duration, + completion_range_epoch_map: invocation.completion_range_epoch_map, + journal_metadata: invocation.journal_metadata, + hotfix_apply_cancellation_after_deployment_is_pinned: false, + + // Reset the pinned deployment + pinned_deployment: None, + // Reset timestamps + timestamps: StatusTimestamps::now(), + // Reset response sinks + response_sinks: Default::default(), + }; + + // Finally, it's time to invoke again! 
+ ctx.invoke( + invocation_id, + in_flight_invocation_metadata, + InvokeInputJournal::NoCachedJournal, + ) + .await?; + + // Reply to the listener, restart went well + ctx.reply_to_restart_invocation(response_sink, RestartInvocationResponse::Ok); + + Ok(()) + } +} + +impl StateMachineApplyContext<'_, S> { + async fn find_input_entry( + &mut self, + invocation_id: InvocationId, + length: EntryIndex, + ) -> Result, Error> + where + S: ReadOnlyJournalTable, + { + // Find input entry + for i in 0..length { + let Some(entry) = self.storage.get_journal_entry(invocation_id, i).await? else { + return Ok(None); + }; + if entry.ty() == journal_v2::EntryType::Command(CommandType::Input) { + return Ok(Some(i)); + } + } + Ok(None) + } + + fn reply_to_restart_invocation( + &mut self, + response_sink: Option, + response: RestartInvocationResponse, + ) { + if response_sink.is_none() { + return; + } + let InvocationMutationResponseSink::Ingress(IngressInvocationResponseSink { request_id }) = + response_sink.unwrap(); + debug_if_leader!( + self.is_leader, + "Send restart response to request id '{:?}': {:?}", + request_id, + response + ); + + self.action_collector + .push(Action::ForwardRestartInvocationResponse { + request_id, + response, + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::partition::state_machine::Action; + use crate::partition::state_machine::tests::TestEnv; + use crate::partition::state_machine::tests::fixtures::{ + invoker_end_effect, invoker_entry_effect, pinned_deployment, + }; + use crate::partition::state_machine::tests::matchers::storage::{ + has_commands, has_journal_length, is_epoch, is_variant, + }; + use bytes::Bytes; + use bytestring::ByteString; + use futures::TryStreamExt; + use googletest::prelude::*; + use restate_storage_api::invocation_status_table::{ + InvocationStatusDiscriminants, ReadOnlyInvocationStatusTable, + }; + use restate_storage_api::promise_table::{ + Promise, PromiseResult, PromiseState, ReadOnlyPromiseTable, + 
}; + use restate_storage_api::state_table::ReadOnlyStateTable; + use restate_types::identifiers::PartitionProcessorRpcRequestId; + use restate_types::invocation::client::InvocationOutputResponse; + use restate_types::invocation::{ + InvocationTarget, ServiceInvocation, ServiceInvocationResponseSink, restart, + }; + use restate_types::journal_v2::{ + CommandType, CompletePromiseCommand, CompletePromiseValue, CompletionType, OutputCommand, + OutputResult, SetStateCommand, SleepCommand, + }; + use restate_types::service_protocol::ServiceProtocolVersion; + use restate_types::time::MillisSinceEpoch; + use restate_wal_protocol::Command; + use std::time::Duration; + + #[restate_core::test] + async fn restart_completed_invocation() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + let request_id = PartitionProcessorRpcRequestId::default(); + + // Create and complete a fresh invocation + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + response_sink: Some(ServiceInvocationResponseSink::Ingress { request_id }), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"123")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + ]) + .await; + + // InvocationStatus contains completed + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + + // We also retain the journal here + test_env + .verify_journal_components( + 
invocation_id, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + + // Now let's restart the invocation + let restart_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::RestartInvocation(restart::Request { + invocation_id, + if_running: Default::default(), + previous_attempt_retention: Default::default(), + apply_to_workflow_run: Default::default(), + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: restart_request_id, + }, + )), + })) + .await; + + // Assert restart response + assert_that!( + actions, + all!( + contains(pat!(Action::ForwardRestartInvocationResponse { + request_id: eq(restart_request_id), + response: eq(RestartInvocationResponse::Ok) + })), + contains(pat!(Action::Invoke { + invocation_id: eq(invocation_id), + })) + ) + ); + + // Verify the invocation is now in-flight + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + is_epoch(1) + )) + ); + + // Verify the journal contains only the input entry + test_env + .verify_journal_components(invocation_id, [CommandType::Input.into()]) + .await; + + // Verify we have the archived status and journal as well + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn restart_running_invocation() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + 
let request_id = PartitionProcessorRpcRequestId::default(); + + // Create an invocation but don't complete it + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + response_sink: Some(ServiceInvocationResponseSink::Ingress { request_id }), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + // Just add some entry here + invoker_entry_effect( + invocation_id, + SleepCommand { + wake_up_time: MillisSinceEpoch::now(), + completion_id: 1, + name: Default::default(), + }, + ), + ]) + .await; + + // InvocationStatus should be Invoked + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + has_journal_length(2), + has_commands(2), + is_epoch(0) + )) + ); + + // Now let's restart the invocation + let restart_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::RestartInvocation(restart::Request { + invocation_id, + if_running: IfRunning::Kill, + previous_attempt_retention: None, + apply_to_workflow_run: ApplyToWorkflowRun::Nothing, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: restart_request_id, + }, + )), + })) + .await; + + // Assert restart response + assert_that!( + actions, + all!( + // Verify the invocation gets killed (returning an error to ingress) + contains(pat!(Action::IngressResponse { + request_id: eq(request_id), + invocation_id: some(eq(invocation_id)), + response: eq(InvocationOutputResponse::Failure( + RESTARTED_INVOCATION_ERROR + )) + })), + // Verify the restart response is sent + contains(pat!(Action::ForwardRestartInvocationResponse { + request_id: eq(restart_request_id), + response: eq(RestartInvocationResponse::Ok) + 
})), + // Verify the invocation is restarted + contains(pat!(Action::Invoke { + invocation_id: eq(invocation_id), + })) + ) + ); + + // Verify the invocation is now in-flight with a new epoch, and contains only the input entry + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + is_epoch(1) + )) + ); + test_env + .verify_journal_components(invocation_id, [CommandType::Input.into()]) + .await; + + // Verify we have the archived status as well + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [ + CommandType::Input.into(), + CommandType::Sleep.into(), + CommandType::Output.into(), + ], + ) + .await; + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn restart_workflow_invocation() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_workflow(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + let request_id = PartitionProcessorRpcRequestId::default(); + let workflow_id = invocation_target.as_keyed_service_id().unwrap(); + + let promise_key = ByteString::from("promisekey"); + let promise_value = Bytes::copy_from_slice(b"promisevalue"); + + // Create and complete a workflow invocation + let actions = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + response_sink: Some(ServiceInvocationResponseSink::Ingress { request_id }), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( 
+ invocation_id, + SetStateCommand { + key: ByteString::from("key"), + value: Bytes::copy_from_slice(b"value"), + name: Default::default(), + }, + ), + invoker_entry_effect( + invocation_id, + CompletePromiseCommand { + key: promise_key.clone(), + value: CompletePromiseValue::Success(promise_value.clone()), + completion_id: 1, + name: Default::default(), + }, + ), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"workflow result")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + ]) + .await; + + // Assert response + assert_that!( + actions, + contains(pat!(Action::IngressResponse { + request_id: eq(request_id), + invocation_id: some(eq(invocation_id)), + response: eq(InvocationOutputResponse::Success( + invocation_target.clone(), + Bytes::from_static(b"workflow result") + )) + })) + ); + + // Verify state and promises exist + let states: Vec<_> = test_env + .storage() + .get_all_user_states_for_service(&workflow_id) + .unwrap() + .try_collect() + .await + .unwrap(); + assert_that!(states, not(empty())); + let promise = test_env + .storage() + .get_promise(&workflow_id, &promise_key) + .await + .unwrap(); + assert_that!( + promise, + some(pat!(Promise { + state: eq(PromiseState::Completed(PromiseResult::Success( + promise_value + ))) + })) + ); + + // InvocationStatus contains completed + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::SetState.into(), + CommandType::CompletePromise.into(), + CompletionType::CompletePromise.into(), + CommandType::Output.into(), + ], + ) + .await; + + // Now let's restart the workflow invocation with ClearAllPromisesAndState + let restart_request_id = PartitionProcessorRpcRequestId::default(); + let actions = 
test_env + .apply(Command::RestartInvocation(restart::Request { + invocation_id, + if_running: IfRunning::Kill, + previous_attempt_retention: None, + apply_to_workflow_run: ApplyToWorkflowRun::ClearAllPromisesAndState, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: restart_request_id, + }, + )), + })) + .await; + + // Assert restart response + assert_that!( + actions, + contains(pat!(Action::ForwardRestartInvocationResponse { + request_id: eq(restart_request_id), + response: eq(RestartInvocationResponse::Ok) + })) + ); + + // Verify the invocation is now in-flight with a new epoch + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + is_epoch(1) + )) + ); + test_env + .verify_journal_components(invocation_id, [CommandType::Input.into()]) + .await; + + // Verify state has been cleared + let states: Vec<_> = test_env + .storage() + .get_all_user_states_for_service(&workflow_id) + .unwrap() + .try_collect() + .await + .unwrap(); + assert_that!(states, empty()); + + // Verify promises have been cleared + let promise = test_env + .storage() + .get_promise(&workflow_id, &promise_key) + .await + .unwrap(); + assert_that!(promise, none()); + + // Verify previous journal and status were correctly archived + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [ + CommandType::Input.into(), + CommandType::SetState.into(), + CommandType::CompletePromise.into(), + CompletionType::CompletePromise.into(), + CommandType::Output.into(), + ], + ) + .await; + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn restart_running_invocation_without_retention() { + let mut test_env = 
TestEnv::create().await; + + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + let request_id = PartitionProcessorRpcRequestId::default(); + + // Create an invocation but don't complete it + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + response_sink: Some(ServiceInvocationResponseSink::Ingress { request_id }), + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + // Just add some entry here + invoker_entry_effect( + invocation_id, + SleepCommand { + wake_up_time: MillisSinceEpoch::now(), + completion_id: 1, + name: Default::default(), + }, + ), + ]) + .await; + + // InvocationStatus should be Invoked + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + has_journal_length(2), + has_commands(2), + is_epoch(0) + )) + ); + + // Now let's restart the invocation + let restart_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::RestartInvocation(restart::Request { + invocation_id, + if_running: IfRunning::Kill, + previous_attempt_retention: None, + apply_to_workflow_run: ApplyToWorkflowRun::Nothing, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: restart_request_id, + }, + )), + })) + .await; + + // Assert restart response + assert_that!( + actions, + all!( + // Verify the invocation gets killed (returning an error to ingress) + contains(pat!(Action::IngressResponse { + request_id: eq(request_id), + invocation_id: some(eq(invocation_id)), + response: eq(InvocationOutputResponse::Failure( + RESTARTED_INVOCATION_ERROR + )) + })), + // Verify the restart response is sent + contains(pat!(Action::ForwardRestartInvocationResponse { + request_id: 
eq(restart_request_id), + response: eq(RestartInvocationResponse::Ok) + })), + // Verify the invocation is restarted + contains(pat!(Action::Invoke { + invocation_id: eq(invocation_id), + })) + ) + ); + + // Verify the invocation is now in-flight with a new epoch, and contains only the input entry + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + is_epoch(1) + )) + ); + test_env + .verify_journal_components(invocation_id, [CommandType::Input.into()]) + .await; + + // No archived status + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(eq(InvocationStatus::Free)) + ); + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn restart_completed_invocation_without_journal_retention() { + let mut test_env = TestEnv::create().await; + + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + let request_id = PartitionProcessorRpcRequestId::default(); + + // Create and complete a fresh invocation + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + response_sink: Some(ServiceInvocationResponseSink::Ingress { request_id }), + completion_retention_duration: Duration::from_secs(60) * 60 * 24, + journal_retention_duration: Duration::ZERO, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"123")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + ]) + .await; + + // InvocationStatus contains completed + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + 
is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0), + // No journal retained + has_journal_length(0), + has_commands(0), + )) + ); + + // Now let's restart the invocation + let restart_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::RestartInvocation(restart::Request { + invocation_id, + if_running: Default::default(), + previous_attempt_retention: Default::default(), + apply_to_workflow_run: Default::default(), + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: restart_request_id, + }, + )), + })) + .await; + + // Restart doesn't work + assert_that!( + actions, + all!( + contains(pat!(Action::ForwardRestartInvocationResponse { + request_id: eq(restart_request_id), + response: eq(RestartInvocationResponse::MissingInput) + })), + not(contains(pat!(Action::Invoke { + invocation_id: eq(invocation_id), + }))) + ) + ); + + // No mutations performed + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0), + // No journal retained + has_journal_length(0), + has_commands(0), + )) + ); + + test_env.shutdown().await; + } +} From 3a01af2a7560b47a671d05fc90516a2263480d55 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:00:59 +0200 Subject: [PATCH 05/14] Adapt the purge commands to understand invocation epoch. We also add an invariant that when you remove the last epoch, it removes all the old attempts too. This simplifies a bit the behavior and how we want to show it. 
--- .../state_machine/lifecycle/purge.rs | 378 ++++++++++++++++- .../state_machine/lifecycle/purge_journal.rs | 379 +++++++++++++++++- 2 files changed, 741 insertions(+), 16 deletions(-) diff --git a/crates/worker/src/partition/state_machine/lifecycle/purge.rs b/crates/worker/src/partition/state_machine/lifecycle/purge.rs index 2d237488fa..9d936ad01d 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/purge.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/purge.rs @@ -8,7 +8,8 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. -use crate::partition::state_machine::{CommandHandler, Error, StateMachineApplyContext}; +use crate::debug_if_leader; +use crate::partition::state_machine::{Action, CommandHandler, Error, StateMachineApplyContext}; use restate_storage_api::idempotency_table::IdempotencyTable; use restate_storage_api::invocation_status_table::{ CompletedInvocation, InvocationStatus, InvocationStatusTable, @@ -21,7 +22,8 @@ use restate_storage_api::state_table::StateTable; use restate_types::identifiers::{IdempotencyId, InvocationId}; use restate_types::invocation::client::PurgeInvocationResponse; use restate_types::invocation::{ - InvocationMutationResponseSink, InvocationTargetType, WorkflowHandlerType, + IngressInvocationResponseSink, InvocationEpoch, InvocationMutationResponseSink, + InvocationTargetType, WorkflowHandlerType, }; use restate_types::service_protocol::ServiceProtocolVersion; use tracing::trace; @@ -29,6 +31,7 @@ use tracing::trace; pub struct OnPurgeCommand { pub invocation_id: InvocationId, pub response_sink: Option, + pub invocation_epoch: InvocationEpoch, } impl<'ctx, 's: 'ctx, S> CommandHandler<&'ctx mut StateMachineApplyContext<'s, S>> for OnPurgeCommand @@ -45,8 +48,63 @@ where let OnPurgeCommand { invocation_id, response_sink, + invocation_epoch, } = self; - match ctx.get_invocation_status(&invocation_id).await? 
{ + + let Some(latest_epoch) = ctx + .storage + .get_latest_epoch_for_invocation_status(&invocation_id) + .await? + else { + trace!("Received purge command for unknown invocation with id '{invocation_id}'."); + ctx.reply_to_purge_invocation(response_sink, PurgeInvocationResponse::NotFound); + return Ok(()); + }; + + match latest_epoch.cmp(&invocation_epoch) { + std::cmp::Ordering::Equal => { + // Purge all epochs + for epoch in 0..=latest_epoch { + Self::purge_for_epoch(ctx, invocation_id, None, epoch).await?; + } + ctx.reply_to_purge_invocation(response_sink, PurgeInvocationResponse::Ok); + } + std::cmp::Ordering::Greater => { + // Only purge my epoch + Self::purge_for_epoch(ctx, invocation_id, response_sink, invocation_epoch).await?; + } + std::cmp::Ordering::Less => { + trace!( + "Received purge command for invocation id {invocation_id} with unknown epoch {invocation_epoch}." + ); + ctx.reply_to_purge_invocation(response_sink, PurgeInvocationResponse::NotFound); + } + } + + Ok(()) + } +} + +impl OnPurgeCommand { + async fn purge_for_epoch<'ctx, 's: 'ctx, S>( + ctx: &'ctx mut StateMachineApplyContext<'s, S>, + invocation_id: InvocationId, + response_sink: Option, + invocation_epoch: InvocationEpoch, + ) -> Result<(), Error> + where + S: JournalTable + + InvocationStatusTable + + StateTable + + journal_table::JournalTable + + IdempotencyTable + + VirtualObjectStatusTable + + PromiseTable, + { + match ctx + .get_invocation_status_for_epoch(&invocation_id, invocation_epoch) + .await? 
+ { InvocationStatus::Completed(CompletedInvocation { invocation_target, idempotency_key, @@ -59,7 +117,14 @@ where pinned_deployment.service_protocol_version >= ServiceProtocolVersion::V4 }); - ctx.do_free_invocation(invocation_id).await?; + debug_if_leader!( + ctx.is_leader, + restate.invocation.id = %invocation_id, + "Effect: Delete invocation" + ); + ctx.storage + .delete_invocation_status(&invocation_id, Some(invocation_epoch)) + .await?; // Also cleanup the associated idempotency key if any if let Some(idempotency_key) = idempotency_key { @@ -80,8 +145,7 @@ where .expect("Workflow methods must have keyed service id"); ctx.do_unlock_service(service_id.clone()).await?; - ctx.do_clear_all_state(service_id.clone(), invocation_id) - .await?; + ctx.do_clear_all_state(service_id.clone()).await?; ctx.do_clear_all_promises(service_id).await?; } @@ -89,6 +153,7 @@ where if journal_metadata.length != 0 { ctx.do_drop_journal( invocation_id, + Some(invocation_epoch), journal_metadata.length, should_remove_journal_table_v2, ) @@ -107,7 +172,306 @@ where ctx.reply_to_purge_invocation(response_sink, PurgeInvocationResponse::NotCompleted); } }; - Ok(()) } } + +impl StateMachineApplyContext<'_, S> { + fn reply_to_purge_invocation( + &mut self, + response_sink: Option, + response: PurgeInvocationResponse, + ) { + if response_sink.is_none() { + return; + } + let InvocationMutationResponseSink::Ingress(IngressInvocationResponseSink { request_id }) = + response_sink.unwrap(); + debug_if_leader!( + self.is_leader, + "Send purge response to request id '{:?}': {:?}", + request_id, + response + ); + + self.action_collector + .push(Action::ForwardPurgeInvocationResponse { + request_id, + response, + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::partition::state_machine::Action; + use crate::partition::state_machine::tests::TestEnv; + use crate::partition::state_machine::tests::fixtures::{ + invoker_end_effect, invoker_end_effect_for_epoch, invoker_entry_effect, 
+ invoker_entry_effect_for_epoch, pinned_deployment, pinned_deployment_for_epoch, + }; + use crate::partition::state_machine::tests::matchers::storage::{ + has_commands, has_journal_length, is_epoch, is_variant, + }; + use bytes::Bytes; + use googletest::prelude::*; + use restate_storage_api::invocation_status_table::{ + InvocationStatusDiscriminants, ReadOnlyInvocationStatusTable, + }; + use restate_storage_api::journal_table_v2::ReadOnlyJournalTable; + use restate_types::identifiers::PartitionProcessorRpcRequestId; + use restate_types::invocation::{ + InvocationTarget, PurgeInvocationRequest, ServiceInvocation, restart, + }; + use restate_types::journal_v2::{CommandType, OutputCommand, OutputResult}; + use restate_types::service_protocol::ServiceProtocolVersion; + use restate_wal_protocol::Command; + use std::time::Duration; + + #[restate_core::test] + async fn purge_restarted_invocation() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + + // Create a fresh invocation, complete it, then restart it + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"123")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + Command::RestartInvocation(restart::Request { + invocation_id, + if_running: Default::default(), + previous_attempt_retention: Default::default(), + apply_to_workflow_run: Default::default(), + response_sink: None, + }), + ]) + .await; + + // Verify we have the archived 
status and journal + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + + // Now purge the epoch 0 + let purge_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::PurgeInvocation(PurgeInvocationRequest { + invocation_id, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: purge_request_id, + }, + )), + invocation_epoch: 0, + })) + .await; + + assert_that!( + actions, + contains(pat!(Action::ForwardPurgeInvocationResponse { + request_id: eq(purge_request_id), + response: eq(PurgeInvocationResponse::Ok) + })) + ); + + // Nothing left for epoch 0 + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(eq(InvocationStatus::Free)) + ); + assert_that!( + test_env + .storage() + .get_journal_entry_for_epoch(invocation_id, 0, 0) + .await, + ok(none()) + ); + + // Latest/epoch 1 left untouched + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + has_journal_length(1), + has_commands(1), + is_epoch(1) + )) + ); + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn purge_all_epochs_when_latest_provided() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + + // Create a fresh invocation, complete it, then restart it, then complete it + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + 
invocation_target: invocation_target.clone(), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"123")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + Command::RestartInvocation(restart::Request { + invocation_id, + if_running: Default::default(), + previous_attempt_retention: Default::default(), + apply_to_workflow_run: Default::default(), + response_sink: None, + }), + pinned_deployment_for_epoch(invocation_id, 1, ServiceProtocolVersion::V5), + invoker_entry_effect_for_epoch( + invocation_id, + 1, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"456")), + name: Default::default(), + }, + ), + invoker_end_effect_for_epoch(invocation_id, 1), + ]) + .await; + + // Verify we have both invocations + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(1) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + + // Now purge the epoch 1 + let purge_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::PurgeInvocation(PurgeInvocationRequest { + invocation_id, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: purge_request_id, + }, + )), + 
invocation_epoch: 1, + })) + .await; + + assert_that!( + actions, + contains(pat!(Action::ForwardPurgeInvocationResponse { + request_id: eq(purge_request_id), + response: eq(PurgeInvocationResponse::Ok) + })) + ); + + // Nothing left for epoch 0 neither 1 + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(eq(InvocationStatus::Free)) + ); + assert_that!( + test_env + .storage() + .get_journal_entry_for_epoch(invocation_id, 0, 0) + .await, + ok(none()) + ); + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(eq(InvocationStatus::Free)) + ); + assert_that!( + test_env.storage().get_journal_entry(invocation_id, 0).await, + ok(none()) + ); + + test_env.shutdown().await; + } +} diff --git a/crates/worker/src/partition/state_machine/lifecycle/purge_journal.rs b/crates/worker/src/partition/state_machine/lifecycle/purge_journal.rs index 40afcaccda..83f888e2c1 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/purge_journal.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/purge_journal.rs @@ -8,19 +8,23 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0. 
-use crate::partition::state_machine::{CommandHandler, Error, StateMachineApplyContext}; +use crate::debug_if_leader; +use crate::partition::state_machine::{Action, CommandHandler, Error, StateMachineApplyContext}; use restate_storage_api::invocation_status_table::{InvocationStatus, InvocationStatusTable}; use restate_storage_api::journal_table; use restate_storage_api::journal_table_v2::JournalTable; use restate_types::identifiers::InvocationId; -use restate_types::invocation::InvocationMutationResponseSink; use restate_types::invocation::client::PurgeInvocationResponse; +use restate_types::invocation::{ + IngressInvocationResponseSink, InvocationEpoch, InvocationMutationResponseSink, +}; use restate_types::service_protocol::ServiceProtocolVersion; use tracing::trace; pub struct OnPurgeJournalCommand { pub invocation_id: InvocationId, pub response_sink: Option, + pub invocation_epoch: InvocationEpoch, } impl<'ctx, 's: 'ctx, S> CommandHandler<&'ctx mut StateMachineApplyContext<'s, S>> @@ -32,8 +36,62 @@ where let OnPurgeJournalCommand { invocation_id, response_sink, + invocation_epoch, } = self; - match ctx.get_invocation_status(&invocation_id).await? { + + let Some(latest_epoch) = ctx + .storage + .get_latest_epoch_for_invocation_status(&invocation_id) + .await? + else { + trace!( + "Received purge journal command for unknown invocation with id '{invocation_id}'." 
+ ); + ctx.reply_to_purge_journal(response_sink, PurgeInvocationResponse::NotFound); + return Ok(()); + }; + + match latest_epoch.cmp(&invocation_epoch) { + std::cmp::Ordering::Equal => { + // Purge all epochs + for epoch in 0..=latest_epoch { + Self::purge_for_epoch(ctx, invocation_id, None, epoch, epoch == latest_epoch) + .await?; + } + ctx.reply_to_purge_journal(response_sink, PurgeInvocationResponse::Ok); + } + std::cmp::Ordering::Greater => { + // Only purge my epoch + Self::purge_for_epoch(ctx, invocation_id, response_sink, invocation_epoch, false) + .await?; + } + std::cmp::Ordering::Less => { + trace!( + "Received purge journal command for invocation id {invocation_id} with unknown epoch {invocation_epoch}." + ); + ctx.reply_to_purge_journal(response_sink, PurgeInvocationResponse::NotFound); + } + } + + Ok(()) + } +} + +impl OnPurgeJournalCommand { + async fn purge_for_epoch<'ctx, 's: 'ctx, S>( + ctx: &'ctx mut StateMachineApplyContext<'s, S>, + invocation_id: InvocationId, + response_sink: Option, + invocation_epoch: InvocationEpoch, + is_latest: bool, + ) -> Result<(), Error> + where + S: JournalTable + InvocationStatusTable + journal_table::JournalTable, + { + match ctx + .get_invocation_status_for_epoch(&invocation_id, invocation_epoch) + .await? 
+ { InvocationStatus::Completed(mut completed) => { let should_remove_journal_table_v2 = completed .pinned_deployment @@ -46,6 +104,7 @@ where if completed.journal_metadata.length != 0 { ctx.do_drop_journal( invocation_id, + Some(invocation_epoch), completed.journal_metadata.length, should_remove_journal_table_v2, ) @@ -57,9 +116,22 @@ where completed.journal_metadata.commands = 0; // Update invocation status - ctx.storage - .put_invocation_status(&invocation_id, &InvocationStatus::Completed(completed)) - .await?; + if is_latest { + ctx.storage + .put_invocation_status( + &invocation_id, + &InvocationStatus::Completed(completed), + ) + .await?; + } else { + ctx.storage + .archive_invocation_status_to_epoch( + &invocation_id, + invocation_epoch, + &InvocationStatus::Completed(completed), + ) + .await?; + }; ctx.reply_to_purge_journal(response_sink, PurgeInvocationResponse::Ok); } InvocationStatus::Free => { @@ -75,11 +147,36 @@ where ctx.reply_to_purge_journal(response_sink, PurgeInvocationResponse::NotCompleted); } }; - Ok(()) } } +impl StateMachineApplyContext<'_, S> { + fn reply_to_purge_journal( + &mut self, + response_sink: Option, + response: PurgeInvocationResponse, + ) { + if response_sink.is_none() { + return; + } + let InvocationMutationResponseSink::Ingress(IngressInvocationResponseSink { request_id }) = + response_sink.unwrap(); + debug_if_leader!( + self.is_leader, + "Send purge response to request id '{:?}': {:?}", + request_id, + response + ); + + self.action_collector + .push(Action::ForwardPurgeJournalResponse { + request_id, + response, + }); + } +} + #[cfg(test)] mod tests { use super::*; @@ -87,10 +184,11 @@ mod tests { use crate::partition::state_machine::Action; use crate::partition::state_machine::tests::TestEnv; use crate::partition::state_machine::tests::fixtures::{ - invoker_end_effect, invoker_entry_effect, pinned_deployment, + invoker_end_effect, invoker_end_effect_for_epoch, invoker_entry_effect, + invoker_entry_effect_for_epoch, 
pinned_deployment, pinned_deployment_for_epoch, }; use crate::partition::state_machine::tests::matchers::storage::{ - has_commands, has_journal_length, is_variant, + has_commands, has_journal_length, is_epoch, is_variant, }; use bytes::Bytes; use bytestring::ByteString; @@ -103,6 +201,7 @@ mod tests { use restate_types::invocation::client::InvocationOutputResponse; use restate_types::invocation::{ InvocationTarget, PurgeInvocationRequest, ServiceInvocation, ServiceInvocationResponseSink, + restart, }; use restate_types::journal_v2::{CommandType, OutputCommand, OutputResult}; use restate_wal_protocol::Command; @@ -183,6 +282,7 @@ mod tests { .apply(Command::PurgeJournal(PurgeInvocationRequest { invocation_id, response_sink: None, + invocation_epoch: 0, })) .await; @@ -227,6 +327,7 @@ mod tests { .apply(Command::PurgeInvocation(PurgeInvocationRequest { invocation_id, response_sink: None, + invocation_epoch: 0, })) .await; @@ -246,4 +347,264 @@ mod tests { test_env.shutdown().await; } + + #[restate_core::test] + async fn purge_restarted_invocation() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + + // Create a fresh invocation, complete it, then restart it + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"123")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + Command::RestartInvocation(restart::Request { + invocation_id, + if_running: Default::default(), + 
previous_attempt_retention: Default::default(), + apply_to_workflow_run: Default::default(), + response_sink: None, + }), + ]) + .await; + + // Verify we have the archived status and journal + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + + // Now purge the journal epoch 0 + let purge_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::PurgeJournal(PurgeInvocationRequest { + invocation_id, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: purge_request_id, + }, + )), + invocation_epoch: 0, + })) + .await; + + assert_that!( + actions, + contains(pat!(Action::ForwardPurgeJournalResponse { + request_id: eq(purge_request_id), + response: eq(PurgeInvocationResponse::Ok) + })) + ); + + // Still got status, but no journal anymore + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + has_journal_length(0), + has_commands(0), + is_epoch(0) + )) + ); + assert_that!( + test_env + .storage() + .get_journal_entry_for_epoch(invocation_id, 0, 0) + .await, + ok(none()) + ); + + // Latest/epoch 1 left untouched + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Invoked), + has_journal_length(1), + has_commands(1), + is_epoch(1) + )) + ); + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn purge_all_epochs_when_latest_provided() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = 
InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + + // Create a fresh invocation, complete it, then restart it, then complete it + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + }), + pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + invoker_entry_effect( + invocation_id, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"123")), + name: Default::default(), + }, + ), + invoker_end_effect(invocation_id), + Command::RestartInvocation(restart::Request { + invocation_id, + if_running: Default::default(), + previous_attempt_retention: Default::default(), + apply_to_workflow_run: Default::default(), + response_sink: None, + }), + pinned_deployment_for_epoch(invocation_id, 1, ServiceProtocolVersion::V5), + invoker_entry_effect_for_epoch( + invocation_id, + 1, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"456")), + name: Default::default(), + }, + ), + invoker_end_effect_for_epoch(invocation_id, 1), + ]) + .await; + + // Verify we have both invocations + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(0) + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + is_epoch(1) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [CommandType::Input.into(), CommandType::Output.into()], + ) + .await; + + // Now purge the epoch 1 + let purge_request_id = 
PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(Command::PurgeJournal(PurgeInvocationRequest { + invocation_id, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: purge_request_id, + }, + )), + invocation_epoch: 1, + })) + .await; + + assert_that!( + actions, + contains(pat!(Action::ForwardPurgeJournalResponse { + request_id: eq(purge_request_id), + response: eq(PurgeInvocationResponse::Ok) + })) + ); + + // Nothing left for epoch 0 neither 1 + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + has_journal_length(0), + has_commands(0), + is_epoch(0) + )) + ); + assert_that!( + test_env + .storage() + .get_journal_entry_for_epoch(invocation_id, 0, 0) + .await, + ok(none()) + ); + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(all!( + is_variant(InvocationStatusDiscriminants::Completed), + has_journal_length(0), + has_commands(0), + is_epoch(1) + )) + ); + assert_that!( + test_env.storage().get_journal_entry(invocation_id, 0).await, + ok(none()) + ); + + test_env.shutdown().await; + } } From f5a1fe155cb69de5effdff23ce8387e8bb1b165a Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:01:21 +0200 Subject: [PATCH 06/14] Make sure the cleaner now deals with the epoch + fix bug --- crates/worker/src/partition/cleaner.rs | 152 +++++++++++++++++++------ 1 file changed, 117 insertions(+), 35 deletions(-) diff --git a/crates/worker/src/partition/cleaner.rs b/crates/worker/src/partition/cleaner.rs index 55a407715e..7b4b1b91ff 100644 --- a/crates/worker/src/partition/cleaner.rs +++ b/crates/worker/src/partition/cleaner.rs @@ -130,34 +130,33 @@ where // thus it will be cleaned up with the old timer. 
continue; }; - let Some(status_expiration_time) = SystemTime::from(completed_time) - .checked_add(completed_invocation.completion_retention_duration) - else { - // If sum overflow, then the cleanup time lies far enough in the future - continue; - }; let now = SystemTime::now(); - if now >= status_expiration_time { - restate_bifrost::append_to_bifrost( - bifrost, - Arc::new(Envelope { - header: Header { - source: bifrost_envelope_source.clone(), - dest: Destination::Processor { - partition_key: invocation_id.partition_key(), - dedup: None, + if let Some(status_expiration_time) = SystemTime::from(completed_time) + .checked_add(completed_invocation.completion_retention_duration) + { + if now >= status_expiration_time { + restate_bifrost::append_to_bifrost( + bifrost, + Arc::new(Envelope { + header: Header { + source: bifrost_envelope_source.clone(), + dest: Destination::Processor { + partition_key: invocation_id.partition_key(), + dedup: None, + }, }, - }, - command: Command::PurgeInvocation(PurgeInvocationRequest { - invocation_id, - response_sink: None, + command: Command::PurgeInvocation(PurgeInvocationRequest { + invocation_id, + response_sink: None, + invocation_epoch: completed_invocation.invocation_epoch, + }), }), - }), - ) - .await - .context("Cannot append to bifrost purge invocation")?; - continue; + ) + .await + .context("Cannot append to bifrost purge invocation")?; + continue; + } } // We don't cleanup the status yet, let's check if there's a journal to cleanup @@ -185,6 +184,7 @@ where command: Command::PurgeJournal(PurgeInvocationRequest { invocation_id, response_sink: None, + invocation_epoch: completed_invocation.invocation_epoch, }), }), ) @@ -209,10 +209,11 @@ mod tests { use restate_storage_api::StorageError; use restate_storage_api::invocation_status_table::{ CompletedInvocation, InFlightInvocationMetadata, InvocationStatus, - InvokedInvocationStatusLite, + InvokedInvocationStatusLite, JournalMetadata, }; use restate_types::Version; use 
restate_types::identifiers::{InvocationId, InvocationUuid}; + use restate_types::invocation::InvocationEpoch; use restate_types::partition_table::{FindPartition, PartitionTable}; use std::future::Future; use test_log::test; @@ -230,6 +231,26 @@ mod tests { std::future::pending() } + fn get_latest_epoch_for_invocation_status( + &mut self, + _invocation_id: &InvocationId, + ) -> impl Future>> + Send + { + todo!(); + #[allow(unreachable_code)] + std::future::pending() + } + + fn get_invocation_status_for_epoch( + &mut self, + _invocation_id: &InvocationId, + _invocation_epoch: InvocationEpoch, + ) -> impl Future> + Send { + todo!(); + #[allow(unreachable_code)] + std::future::pending() + } + fn all_invoked_invocations( &mut self, ) -> std::result::Result< @@ -266,6 +287,12 @@ mod tests { let expired_invocation = InvocationId::from_parts(PartitionKey::MIN, InvocationUuid::mock_random()); + let expired_invocation_with_epoch = + InvocationId::from_parts(PartitionKey::MIN, InvocationUuid::mock_random()); + let expired_journal = + InvocationId::from_parts(PartitionKey::MIN, InvocationUuid::mock_random()); + let expired_journal_with_epoch = + InvocationId::from_parts(PartitionKey::MIN, InvocationUuid::mock_random()); let not_expired_invocation_1 = InvocationId::from_parts(PartitionKey::MIN, InvocationUuid::mock_random()); let not_expired_invocation_2 = @@ -281,6 +308,41 @@ mod tests { ..CompletedInvocation::mock_neo() }), ), + ( + expired_invocation_with_epoch, + InvocationStatus::Completed(CompletedInvocation { + completion_retention_duration: Duration::ZERO, + invocation_epoch: 1, + ..CompletedInvocation::mock_neo() + }), + ), + ( + expired_journal, + InvocationStatus::Completed(CompletedInvocation { + completion_retention_duration: Duration::MAX, + journal_retention_duration: Duration::ZERO, + journal_metadata: JournalMetadata { + length: 2, + commands: 2, + span_context: Default::default(), + }, + ..CompletedInvocation::mock_neo() + }), + ), + ( + 
expired_journal_with_epoch, + InvocationStatus::Completed(CompletedInvocation { + completion_retention_duration: Duration::MAX, + journal_retention_duration: Duration::ZERO, + journal_metadata: JournalMetadata { + length: 2, + commands: 2, + span_context: Default::default(), + }, + invocation_epoch: 1, + ..CompletedInvocation::mock_neo() + }), + ), ( not_expired_invocation_1, InvocationStatus::Completed(CompletedInvocation { @@ -324,19 +386,39 @@ mod tests { }) .unwrap(); - let mut log_entries = bifrost.read_all(partition_id.into()).await.unwrap(); - let bifrost_message = log_entries - .remove(0) - .try_decode::() + let log_entries: Vec<_> = bifrost + .read_all(partition_id.into()) + .await .unwrap() - .unwrap(); + .into_iter() + .map(|e| e.try_decode::().unwrap().unwrap().command) + .collect(); assert_that!( - bifrost_message.command, - pat!(Command::PurgeInvocation(pat!(PurgeInvocationRequest { - invocation_id: eq(expired_invocation) - }))) + log_entries, + all!( + len(eq(4)), + contains(pat!(Command::PurgeInvocation(pat!( + PurgeInvocationRequest { + invocation_id: eq(expired_invocation), + invocation_epoch: eq(0), + } + )))), + contains(pat!(Command::PurgeInvocation(pat!( + PurgeInvocationRequest { + invocation_id: eq(expired_invocation_with_epoch), + invocation_epoch: eq(1), + } + )))), + contains(pat!(Command::PurgeJournal(pat!(PurgeInvocationRequest { + invocation_id: eq(expired_journal), + invocation_epoch: eq(0), + })))), + contains(pat!(Command::PurgeJournal(pat!(PurgeInvocationRequest { + invocation_id: eq(expired_journal_with_epoch), + invocation_epoch: eq(1), + })))), + ) ); - assert_that!(log_entries, empty()); } } From c5077f8c8314f08385538c999650fd802ac1d904 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:01:52 +0200 Subject: [PATCH 07/14] New admin REST API for the restart --- crates/admin/Cargo.toml | 1 + crates/admin/src/rest_api/error.rs | 24 +++ crates/admin/src/rest_api/invocations.rs | 231 +++++++++++++++++++++-- 
crates/admin/src/rest_api/mod.rs | 4 + 4 files changed, 244 insertions(+), 16 deletions(-) diff --git a/crates/admin/Cargo.toml b/crates/admin/Cargo.toml index b049c32bc8..ccc2b47284 100644 --- a/crates/admin/Cargo.toml +++ b/crates/admin/Cargo.toml @@ -24,6 +24,7 @@ restate-core = { workspace = true, features = ["options_schema"] } restate-errors = { workspace = true } restate-futures-util = { workspace = true } restate-metadata-store = { workspace = true } +restate-serde-util = { workspace = true } restate-service-client = { workspace = true } restate-service-protocol = { workspace = true, features = ["discovery"] } restate-storage-query-datafusion = { workspace = true } diff --git a/crates/admin/src/rest_api/error.rs b/crates/admin/src/rest_api/error.rs index f492715552..d26ab814c3 100644 --- a/crates/admin/src/rest_api/error.rs +++ b/crates/admin/src/rest_api/error.rs @@ -171,6 +171,30 @@ impl_meta_api_error!(InvocationWasAlreadyCompletedError: CONFLICT "The invocatio pub(crate) struct PurgeInvocationNotCompletedError(pub(crate) String); impl_meta_api_error!(PurgeInvocationNotCompletedError: CONFLICT "The invocation is not yet completed. An invocation can be purged only when completed."); +#[derive(Debug, thiserror::Error)] +#[error("The invocation '{0}' is still running.")] +pub(crate) struct RestartInvocationStillRunningError(pub(crate) String); +impl_meta_api_error!(RestartInvocationStillRunningError: CONFLICT "The invocation is still running. An invocation can be restarted only when completed, or if the query parameter if_running=kill is provided."); + +#[derive(Debug, thiserror::Error)] +#[error( + "Restarting the invocation '{0}' is not supported, because it was started using the old service protocol." 
+)] +pub(crate) struct RestartInvocationUnsupportedError(pub(crate) String); +impl_meta_api_error!(RestartInvocationUnsupportedError: UNPROCESSABLE_ENTITY "Restarting the invocation is not supported, because it was started using the old service protocol."); + +#[derive(Debug, thiserror::Error)] +#[error( + "The invocation '{0}' cannot be restarted because the input is not available. This indicates that the journal was already purged, or not retained at all." +)] +pub(crate) struct RestartInvocationMissingInputError(pub(crate) String); +impl_meta_api_error!(RestartInvocationMissingInputError: GONE "The invocation cannot be restarted because the input is not available. In order to restart an invocation, the journal must be available in order to read the input again. Journal can be retained after completion by enabling journal retention."); + +#[derive(Debug, thiserror::Error)] +#[error("The invocation '{0}' cannot be restarted because it's not running yet.")] +pub(crate) struct RestartInvocationNotStartedError(pub(crate) String); +impl_meta_api_error!(RestartInvocationNotStartedError: TOO_EARLY "The invocation cannot be restarted because it's not running yet, meaning it might have been scheduled or inboxed."); + // --- Old Meta API errors. Please don't use these anymore. /// This error is used by handlers to propagate API errors, diff --git a/crates/admin/src/rest_api/invocations.rs b/crates/admin/src/rest_api/invocations.rs index a2bc05f11c..c03a3836d7 100644 --- a/crates/admin/src/rest_api/invocations.rs +++ b/crates/admin/src/rest_api/invocations.rs @@ -9,8 +9,6 @@ // by the Apache License, Version 2.0. 
use super::error::*; -use std::sync::Arc; - use crate::generate_meta_api_error; use crate::rest_api::create_envelope_header; use crate::state::AdminServiceState; @@ -20,10 +18,15 @@ use okapi_operation::*; use restate_types::identifiers::{InvocationId, PartitionProcessorRpcRequestId, WithPartitionKey}; use restate_types::invocation::client::{ CancelInvocationResponse, InvocationClient, KillInvocationResponse, PurgeInvocationResponse, + RestartInvocationResponse, +}; +use restate_types::invocation::{ + InvocationEpoch, InvocationTermination, PurgeInvocationRequest, TerminationFlavor, restart, }; -use restate_types::invocation::{InvocationTermination, PurgeInvocationRequest, TerminationFlavor}; use restate_wal_protocol::{Command, Envelope}; use serde::Deserialize; +use std::sync::Arc; +use std::time::Duration; use tracing::warn; #[derive(Debug, Default, Deserialize, JsonSchema)] @@ -98,6 +101,7 @@ pub async fn delete_invocation( DeletionMode::Purge => Command::PurgeInvocation(PurgeInvocationRequest { invocation_id, response_sink: None, + invocation_epoch: 0, }), }; @@ -224,21 +228,37 @@ where generate_meta_api_error!(PurgeInvocationError: [InvocationNotFoundError, InvocationClientError, InvalidFieldError, PurgeInvocationNotCompletedError]); +#[derive(Debug, Default, Deserialize, JsonSchema)] +pub struct PurgeInvocationParams { + pub epoch: Option, +} + /// Purge an invocation #[openapi( summary = "Purge an invocation", - description = "Purge the given invocation. This cleanups all the state for the given invocation. This command applies only to completed invocations.", + description = "Purge the given invocation. This cleanups all the state for the given invocation, including its journal. 
This command applies only to completed invocations.", operation_id = "purge_invocation", tags = "invocation", - parameters(path( - name = "invocation_id", - description = "Invocation identifier.", - schema = "std::string::String" - )) + parameters( + path( + name = "invocation_id", + description = "Invocation identifier.", + schema = "std::string::String" + ), + query( + name = "epoch", + description = "Remove the specific epoch. If not provided, epoch 0 will be removed. When removing the latest epoch, all the previous epochs will be cleaned up as well.", + required = false, + style = "simple", + allow_empty_value = false, + schema = InvocationEpoch, + ) + ) )] pub async fn purge_invocation( State(state): State>, Path(invocation_id): Path, + Query(PurgeInvocationParams { epoch }): Query, ) -> Result<(), PurgeInvocationError> where IC: InvocationClient, @@ -249,7 +269,11 @@ where match state .invocation_client - .purge_invocation(PartitionProcessorRpcRequestId::new(), invocation_id) + .purge_invocation( + PartitionProcessorRpcRequestId::new(), + invocation_id, + epoch.unwrap_or_default(), + ) .await .map_err(InvocationClientError)? { @@ -273,15 +297,26 @@ generate_meta_api_error!(PurgeJournalError: [InvocationNotFoundError, Invocation description = "Purge the given invocation journal. This cleanups only the journal for the given invocation, retaining the metadata. This command applies only to completed invocations.", operation_id = "purge_journal", tags = "invocation", - parameters(path( - name = "invocation_id", - description = "Invocation identifier.", - schema = "std::string::String" - )) + parameters( + path( + name = "invocation_id", + description = "Invocation identifier.", + schema = "std::string::String" + ), + query( + name = "epoch", + description = "Remove the specific epoch. If not provided, epoch 0 will be removed. 
When removing the latest epoch, all the previous epochs will be cleaned up as well.", + required = false, + style = "simple", + allow_empty_value = false, + schema = InvocationEpoch, + ) + ) )] pub async fn purge_journal( State(state): State>, Path(invocation_id): Path, + Query(PurgeInvocationParams { epoch }): Query, ) -> Result<(), PurgeJournalError> where IC: InvocationClient, @@ -292,7 +327,11 @@ where match state .invocation_client - .purge_journal(PartitionProcessorRpcRequestId::new(), invocation_id) + .purge_journal( + PartitionProcessorRpcRequestId::new(), + invocation_id, + epoch.unwrap_or_default(), + ) .await .map_err(InvocationClientError)? { @@ -307,3 +346,163 @@ where Ok(()) } + +/// What to do if the invocation is still running. By default, the running invocation will be killed. +#[derive(Default, Debug, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum RestartInvocationIfRunning { + /// Kill the invocation, sending a failure to the waiting callers, then restart the invocation. + #[default] + Kill, + /// Fail the Restart operation if the invocation is still running. + Fail, +} + +impl From for restart::IfRunning { + fn from(value: RestartInvocationIfRunning) -> Self { + match value { + RestartInvocationIfRunning::Kill => restart::IfRunning::Kill, + RestartInvocationIfRunning::Fail => restart::IfRunning::Fail, + } + } +} + +/// What to do in case of restarting a workflow run. By default, clears all promises and state. 
+#[derive(Default, Debug, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum RestartInvocationApplyToWorkflowRun { + Nothing, + /// Clear all the promises, retain the state + ClearAllPromises, + /// Clear all the state, retain the promises + ClearAllState, + /// Clear all the promises and state + #[default] + ClearAllPromisesAndState, +} + +impl From for restart::ApplyToWorkflowRun { + fn from(value: RestartInvocationApplyToWorkflowRun) -> Self { + match value { + RestartInvocationApplyToWorkflowRun::Nothing => restart::ApplyToWorkflowRun::Nothing, + RestartInvocationApplyToWorkflowRun::ClearAllPromises => { + restart::ApplyToWorkflowRun::ClearOnlyPromises + } + RestartInvocationApplyToWorkflowRun::ClearAllState => { + restart::ApplyToWorkflowRun::ClearOnlyState + } + RestartInvocationApplyToWorkflowRun::ClearAllPromisesAndState => { + restart::ApplyToWorkflowRun::ClearAllPromisesAndState + } + } + } +} + +#[derive(Debug, Default, Deserialize, JsonSchema)] +pub struct RestartInvocationParams { + pub if_running: Option, + #[serde( + default, + with = "serde_with::As::>" + )] + #[schemars(with = "Option")] + pub previous_attempt_retention: Option, + pub apply_to_workflow_run: Option, +} + +generate_meta_api_error!(RestartInvocationError: [ + InvocationNotFoundError, + InvocationClientError, + InvalidFieldError, + RestartInvocationStillRunningError, + RestartInvocationUnsupportedError, + RestartInvocationMissingInputError, + RestartInvocationNotStartedError +]); + +/// Restart an invocation +#[openapi( + summary = "Restart an invocation", + description = "Restart the given invocation. This will restart the invocation, given its input is available.", + operation_id = "restart_invocation", + tags = "invocation", + parameters( + path( + name = "invocation_id", + description = "Invocation identifier.", + schema = "std::string::String" + ), + query( + name = "if_running", + description = "What to do if the invocation is still running. 
By default, the running invocation will be killed.", + required = false, + style = "simple", + allow_empty_value = false, + schema = RestartInvocationIfRunning, + ), + query( + name = "previous_attempt_retention", + description = "If set, it will override the configured completion_retention/journal_retention when the invocation was executed the first time. If none of the completion_retention/journal_retention are configured, and neither this previous_attempt_retention, then the previous attempt won't be retained at all. Can be configured using humantime format or ISO8601.", + required = false, + style = "simple", + allow_empty_value = false, + schema = String, + ), + query( + name = "apply_to_workflow_run", + description = "What to do in case of restarting a workflow run. By default, clears all promises and state.", + required = false, + style = "simple", + allow_empty_value = false, + schema = RestartInvocationApplyToWorkflowRun, + ) + ) +)] +pub async fn restart_invocation( + State(state): State>, + Path(invocation_id): Path, + Query(RestartInvocationParams { + if_running, + previous_attempt_retention, + apply_to_workflow_run, + }): Query, +) -> Result<(), RestartInvocationError> +where + IC: InvocationClient, +{ + let invocation_id = invocation_id + .parse::() + .map_err(|e| InvalidFieldError("invocation_id", e.to_string()))?; + + match state + .invocation_client + .restart_invocation( + PartitionProcessorRpcRequestId::new(), + invocation_id, + if_running.unwrap_or_default().into(), + previous_attempt_retention, + apply_to_workflow_run.unwrap_or_default().into(), + ) + .await + .map_err(InvocationClientError)? + { + RestartInvocationResponse::Ok => {} + RestartInvocationResponse::NotFound => { + Err(InvocationNotFoundError(invocation_id.to_string()))? 
+ } + RestartInvocationResponse::StillRunning => Err(RestartInvocationStillRunningError( + invocation_id.to_string(), + ))?, + RestartInvocationResponse::Unsupported => { + Err(RestartInvocationUnsupportedError(invocation_id.to_string()))? + } + RestartInvocationResponse::MissingInput => Err(RestartInvocationMissingInputError( + invocation_id.to_string(), + ))?, + RestartInvocationResponse::NotStarted => { + Err(RestartInvocationNotStartedError(invocation_id.to_string()))? + } + }; + + Ok(()) +} diff --git a/crates/admin/src/rest_api/mod.rs b/crates/admin/src/rest_api/mod.rs index 00eb1d1bed..6faedeffbb 100644 --- a/crates/admin/src/rest_api/mod.rs +++ b/crates/admin/src/rest_api/mod.rs @@ -102,6 +102,10 @@ where "/invocations/:invocation_id/purge-journal", patch(openapi_handler!(invocations::purge_journal)), ) + .route( + "/invocations/:invocation_id/restart", + patch(openapi_handler!(invocations::restart_invocation)), + ) .route( "/subscriptions", post(openapi_handler!(subscriptions::create_subscription)), From 91441a231d2e9ef906125535968165b9649b051c Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:02:07 +0200 Subject: [PATCH 08/14] Implement the new storage APIs --- crates/partition-store/Cargo.toml | 1 + .../proto/dev/restate/storage/v1/domain.proto | 6 + .../src/invocation_status_table/mod.rs | 222 +++++++++- .../partition-store/src/journal_table/mod.rs | 4 +- .../src/journal_table_v2/mod.rs | 386 ++++++++++++++++-- crates/partition-store/src/keys.rs | 6 + crates/partition-store/src/partition_store.rs | 7 +- crates/partition-store/src/protobuf_types.rs | 57 ++- .../src/tests/journal_table_v2_test/mod.rs | 7 +- 9 files changed, 622 insertions(+), 74 deletions(-) diff --git a/crates/partition-store/Cargo.toml b/crates/partition-store/Cargo.toml index 18836ebe00..6f00800ece 100644 --- a/crates/partition-store/Cargo.toml +++ b/crates/partition-store/Cargo.toml @@ -25,6 +25,7 @@ enum-map = { workspace = true } futures = { workspace = true } 
futures-util = { workspace = true } humantime = { workspace = true } +itertools = { workspace = true } paste = { workspace = true } prost = { workspace = true } prost-types = { workspace = true } diff --git a/crates/partition-store/proto/dev/restate/storage/v1/domain.proto b/crates/partition-store/proto/dev/restate/storage/v1/domain.proto index 22e46c1bf0..9fc4c1a264 100644 --- a/crates/partition-store/proto/dev/restate/storage/v1/domain.proto +++ b/crates/partition-store/proto/dev/restate/storage/v1/domain.proto @@ -189,6 +189,10 @@ message InvocationV2Lite { uint32 current_invocation_epoch = 27; } +message InvocationStatusV2OnlyEpoch { + uint32 current_invocation_epoch = 27; +} + // TODO remove this after 1.1 message InvocationStatus { @@ -587,6 +591,8 @@ message Entry { uint32 signal_idx = 8; string signal_name = 9; } + + uint32 invocation_epoch = 10; } message ResponseResult { diff --git a/crates/partition-store/src/invocation_status_table/mod.rs b/crates/partition-store/src/invocation_status_table/mod.rs index 5fabb6aee9..698dfcda9c 100644 --- a/crates/partition-store/src/invocation_status_table/mod.rs +++ b/crates/partition-store/src/invocation_status_table/mod.rs @@ -16,6 +16,7 @@ use crate::{PartitionStore, TableKind, TableScanIterationDecision}; use crate::{PartitionStoreTransaction, StorageAccess}; use futures::Stream; use futures_util::stream; +use itertools::Itertools; use restate_rocksdb::RocksDbPerfGuard; use restate_storage_api::invocation_status_table::{ InvocationStatus, InvocationStatusDiscriminants, InvocationStatusTable, @@ -61,12 +62,32 @@ define_table_key!( ) ); +define_table_key!( + TableKind::InvocationStatus, + KeyKind::ArchivedInvocationStatus, + ArchivedInvocationStatusKey( + partition_key: PartitionKey, + invocation_uuid: InvocationUuid, + invocation_epoch: InvocationEpoch + ) +); + fn create_invocation_status_key(invocation_id: &InvocationId) -> InvocationStatusKey { InvocationStatusKey::default() 
.partition_key(invocation_id.partition_key()) .invocation_uuid(invocation_id.invocation_uuid()) } +fn create_archived_invocation_status_key( + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, +) -> ArchivedInvocationStatusKey { + ArchivedInvocationStatusKey::default() + .partition_key(invocation_id.partition_key()) + .invocation_uuid(invocation_id.invocation_uuid()) + .invocation_epoch(invocation_epoch) +} + impl PartitionStoreProtobufValue for InvocationStatus { type ProtobufType = crate::protobuf_types::v1::InvocationStatusV2; } @@ -181,13 +202,96 @@ fn try_migrate_and_get_invocation_status( }) } +/// Ready only the epoch of the invocation status. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct InvocationStatusV2OnlyEpoch { + pub current_invocation_epoch: InvocationEpoch, +} + +impl PartitionStoreProtobufValue for InvocationStatusV2OnlyEpoch { + type ProtobufType = crate::protobuf_types::v1::InvocationStatusV2OnlyEpoch; +} + +fn get_latest_epoch_for_invocation_status( + storage: &mut S, + invocation_id: &InvocationId, +) -> Result> { + let _x = RocksDbPerfGuard::new("get-latest-epoch-for-invocation-status"); + Ok(storage + .get_value::<_, InvocationStatusV2OnlyEpoch>(create_invocation_status_key(invocation_id))? + .map(|is| is.current_invocation_epoch)) +} + +fn get_invocation_status_for_epoch( + storage: &mut S, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, +) -> Result { + let _x = RocksDbPerfGuard::new("get-invocation-status-for-epoch"); + + // Try archived first + if let Some(s) = storage.get_value::<_, InvocationStatus>( + create_archived_invocation_status_key(invocation_id, invocation_epoch), + )? { + return Ok(s); + } + + // Nope, try to get the latest and check if the epoch is the same + if let Some(s) = + storage.get_value::<_, InvocationStatus>(create_invocation_status_key(invocation_id))? 
+ { + if s.get_epoch() == invocation_epoch { + return Ok(s); + } + } + + Ok(InvocationStatus::Free) +} + fn delete_invocation_status( storage: &mut S, invocation_id: &InvocationId, + invocation_epoch: Option, ) -> Result<()> { + if let Some(invocation_epoch_to_delete) = invocation_epoch { + // Check if the epoch to delete is latest + let current_epoch = storage + .get_value::<_, InvocationStatusV2OnlyEpoch>(create_invocation_status_key( + invocation_id, + ))? + .map(|is| is.current_invocation_epoch) + .unwrap_or(InvocationEpoch::MAX); + + if current_epoch != invocation_epoch_to_delete { + // Remove the archived status + storage.delete_key(&create_archived_invocation_status_key( + invocation_id, + invocation_epoch_to_delete, + ))?; + return Ok(()); + } + } + + // Just delete the latest // TODO remove this once we remove the old InvocationStatus storage.delete_key(&create_invocation_status_key_v1(invocation_id))?; - storage.delete_key(&create_invocation_status_key(invocation_id)) + storage.delete_key(&create_invocation_status_key(invocation_id))?; + + Ok(()) +} + +fn archive_invocation_status_to_epoch( + storage: &mut S, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + status: &InvocationStatus, +) -> Result<()> { + let key = create_archived_invocation_status_key(invocation_id, invocation_epoch); + match status { + InvocationStatus::Free => storage.delete_key(&key)?, + _ => storage.put_kv(key, status)?, + } + Ok(()) } fn invoked_invocations( @@ -225,7 +329,8 @@ fn all_invocation_status( storage: &S, range: RangeInclusive, ) -> Result> + Send + use<'_, S>> { - Ok(stream::iter( + // TODO remove when we remove invocation status v1 + let invocation_status_v1_iterator = OwnedIterator::new(storage.iterator_from(FullScanPartitionKeyRange::< InvocationStatusKeyV1, >(range.clone()))?) 
@@ -238,23 +343,50 @@ fn all_invocation_status( InvocationId::from_parts(partition_key, invocation_uuid), state_value.0, )) - }) - .chain( - OwnedIterator::new(storage.iterator_from(FullScanPartitionKeyRange::< - InvocationStatusKey, - >(range.clone()))?) - .map(|(mut key, mut value)| { - let state_key = InvocationStatusKey::deserialize_from(&mut key)?; - let state_value = InvocationStatus::decode(&mut value)?; - - let (partition_key, invocation_uuid) = state_key.into_inner_ok_or()?; - Ok(( - InvocationId::from_parts(partition_key, invocation_uuid), - state_value, - )) - }), - ), - )) + }); + + let invocation_status_iterator = + OwnedIterator::new(storage.iterator_from(FullScanPartitionKeyRange::< + InvocationStatusKey, + >(range.clone()))?) + .map(|(mut key, mut value)| { + let state_key = InvocationStatusKey::deserialize_from(&mut key)?; + let state_value = InvocationStatus::decode(&mut value)?; + + let (partition_key, invocation_uuid) = state_key.into_inner_ok_or()?; + Ok(( + InvocationId::from_parts(partition_key, invocation_uuid), + state_value, + )) + }); + + let archived_status_iterator = + OwnedIterator::new(storage.iterator_from(FullScanPartitionKeyRange::< + ArchivedInvocationStatusKey, + >(range))?) 
+ .map(|(mut key, mut value)| { + let state_key = ArchivedInvocationStatusKey::deserialize_from(&mut key)?; + let state_value = InvocationStatus::decode(&mut value)?; + + let (partition_key, invocation_uuid, _) = state_key.into_inner_ok_or()?; + Ok(( + InvocationId::from_parts(partition_key, invocation_uuid), + state_value, + )) + }); + + Ok(stream::iter(invocation_status_v1_iterator.chain( + archived_status_iterator.merge_by(invocation_status_iterator, |is1, is2| { + match (is1, is2) { + (Ok((id1, status1)), Ok((id2, status2))) if id1 == id2 => { + status1.get_epoch() <= status2.get_epoch() + } + (Ok((id1, _)), Ok((id2, _))) => id1 <= id2, + // Doesn't matter if there's an error in between + (_, _) => true, + } + }), + ))) } // TODO remove this once we remove the old InvocationStatus @@ -301,6 +433,23 @@ impl ReadOnlyInvocationStatusTable for PartitionStore { get_invocation_status(self, invocation_id) } + async fn get_latest_epoch_for_invocation_status( + &mut self, + invocation_id: &InvocationId, + ) -> Result> { + self.assert_partition_key(invocation_id)?; + get_latest_epoch_for_invocation_status(self, invocation_id) + } + + async fn get_invocation_status_for_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + ) -> Result { + self.assert_partition_key(invocation_id)?; + get_invocation_status_for_epoch(self, invocation_id, invocation_epoch) + } + fn all_invoked_invocations( &mut self, ) -> Result> + Send> { @@ -327,6 +476,23 @@ impl ReadOnlyInvocationStatusTable for PartitionStoreTransaction<'_> { try_migrate_and_get_invocation_status(self, invocation_id) } + async fn get_latest_epoch_for_invocation_status( + &mut self, + invocation_id: &InvocationId, + ) -> Result> { + self.assert_partition_key(invocation_id)?; + get_latest_epoch_for_invocation_status(self, invocation_id) + } + + async fn get_invocation_status_for_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + ) -> Result { + 
self.assert_partition_key(invocation_id)?; + get_invocation_status_for_epoch(self, invocation_id, invocation_epoch) + } + fn all_invoked_invocations( &mut self, ) -> Result> + Send> { @@ -354,9 +520,23 @@ impl InvocationStatusTable for PartitionStoreTransaction<'_> { put_invocation_status(self, invocation_id, status) } - async fn delete_invocation_status(&mut self, invocation_id: &InvocationId) -> Result<()> { + async fn archive_invocation_status_to_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + status: &InvocationStatus, + ) -> Result<()> { + self.assert_partition_key(invocation_id)?; + archive_invocation_status_to_epoch(self, invocation_id, invocation_epoch, status) + } + + async fn delete_invocation_status( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: Option, + ) -> Result<()> { self.assert_partition_key(invocation_id)?; - delete_invocation_status(self, invocation_id) + delete_invocation_status(self, invocation_id, invocation_epoch) } } diff --git a/crates/partition-store/src/journal_table/mod.rs b/crates/partition-store/src/journal_table/mod.rs index b132215bf4..e896f5d981 100644 --- a/crates/partition-store/src/journal_table/mod.rs +++ b/crates/partition-store/src/journal_table/mod.rs @@ -21,8 +21,9 @@ use futures_util::stream; use restate_rocksdb::RocksDbPerfGuard; use restate_storage_api::Result; use restate_storage_api::journal_table::{JournalEntry, JournalTable, ReadOnlyJournalTable}; +use restate_storage_api::journal_table_v2::JournalEntryId; use restate_types::identifiers::{ - EntryIndex, InvocationId, InvocationUuid, JournalEntryId, PartitionKey, WithPartitionKey, + EntryIndex, InvocationId, InvocationUuid, PartitionKey, WithPartitionKey, }; use std::io::Cursor; use std::ops::RangeInclusive; @@ -116,6 +117,7 @@ fn all_journals( Ok(( JournalEntryId::from_parts( InvocationId::from_parts(partition_key, invocation_uuid), + 0, entry_index, ), journal_entry, diff --git 
a/crates/partition-store/src/journal_table_v2/mod.rs b/crates/partition-store/src/journal_table_v2/mod.rs index c1d99e4580..45712f91b7 100644 --- a/crates/partition-store/src/journal_table_v2/mod.rs +++ b/crates/partition-store/src/journal_table_v2/mod.rs @@ -19,12 +19,14 @@ use crate::{TableScan, TableScanIterationDecision}; use anyhow::anyhow; use futures::Stream; use futures_util::stream; +use itertools::Itertools; use restate_rocksdb::RocksDbPerfGuard; -use restate_storage_api::journal_table_v2::{JournalTable, ReadOnlyJournalTable}; +use restate_storage_api::journal_table_v2::{JournalEntryId, JournalTable, ReadOnlyJournalTable}; use restate_storage_api::{Result, StorageError}; use restate_types::identifiers::{ - EntryIndex, InvocationId, InvocationUuid, JournalEntryId, PartitionKey, WithPartitionKey, + EntryIndex, InvocationId, InvocationUuid, PartitionKey, WithInvocationId, WithPartitionKey, }; +use restate_types::invocation::InvocationEpoch; use restate_types::journal_v2::raw::{RawCommand, RawEntry, RawEntryInner}; use restate_types::journal_v2::{CompletionId, EntryMetadata, NotificationId}; use std::collections::HashMap; @@ -40,6 +42,16 @@ define_table_key!( journal_index: u32 ) ); +define_table_key!( + Journal, + KeyKind::ArchivedJournalV2, + ArchivedJournalKey( + partition_key: PartitionKey, + invocation_uuid: InvocationUuid, + invocation_epoch: InvocationEpoch, + journal_index: u32 + ) +); define_table_key!( Journal, @@ -68,8 +80,23 @@ fn write_journal_entry_key(invocation_id: &InvocationId, journal_index: u32) -> .journal_index(journal_index) } +fn write_archived_journal_entry_key( + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + journal_index: u32, +) -> ArchivedJournalKey { + ArchivedJournalKey::default() + .partition_key(invocation_id.partition_key()) + .invocation_uuid(invocation_id.invocation_uuid()) + .invocation_epoch(invocation_epoch) + .journal_index(journal_index) +} + #[derive(Debug, Clone)] -pub struct StoredEntry(pub 
RawEntry); +pub struct StoredEntry { + pub entry: RawEntry, + pub epoch: InvocationEpoch, +} impl PartitionStoreProtobufValue for StoredEntry { type ProtobufType = crate::protobuf_types::v1::Entry; } @@ -83,6 +110,7 @@ impl PartitionStoreProtobufValue for JournalEntryIndex { fn put_journal_entry( storage: &mut S, invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, journal_index: u32, journal_entry: &RawEntry, related_completion_ids: &[CompletionId], @@ -109,7 +137,10 @@ fn put_journal_entry( storage.put_kv( write_journal_entry_key(invocation_id, journal_index), - &StoredEntry(journal_entry.clone()), + &StoredEntry { + entry: journal_entry.clone(), + epoch: invocation_epoch, + }, ) } @@ -118,9 +149,39 @@ fn get_journal_entry( invocation_id: &InvocationId, journal_index: u32, ) -> Result> { + let _x = RocksDbPerfGuard::new("get-journal-entry"); let key = write_journal_entry_key(invocation_id, journal_index); let opt: Option = storage.get_value(key)?; - Ok(opt.map(|e| e.0)) + Ok(opt.map(|e| e.entry)) +} + +fn get_journal_entry_for_epoch( + storage: &mut S, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + journal_index: u32, +) -> Result> { + let _x = RocksDbPerfGuard::new("get-journal-entry-for-epoch"); + // Try archived first + if let Some(s) = storage.get_value::<_, StoredEntry>(write_archived_journal_entry_key( + invocation_id, + invocation_epoch, + journal_index, + ))? { + return Ok(Some(s.entry)); + } + + // Nope, try to get the latest and check if the epoch is the same + if let Some(s) = storage + .get_value::<_, StoredEntry>(write_journal_entry_key(invocation_id, invocation_epoch))? 
+ { + if s.epoch == invocation_epoch { + return Ok(Some(s.entry)); + } + } + + // Not found + Ok(None) } fn get_journal( @@ -145,7 +206,7 @@ fn get_journal( let entry = StoredEntry::decode(&mut v).map_err(|error| StorageError::Generic(error.into())); - let result = key.and_then(|key| entry.map(|entry| (key, entry.0))); + let result = key.and_then(|key| entry.map(|entry| (key, entry.entry))); n += 1; if n < journal_length { @@ -161,22 +222,64 @@ fn all_journals( storage: &S, range: RangeInclusive, ) -> Result> + Send + use<'_, S>> { - let iter = storage.iterator_from(FullScanPartitionKeyRange::(range))?; - Ok(stream::iter(OwnedIterator::new(iter).map( - |(mut key, mut value)| { - let journal_key = JournalKey::deserialize_from(&mut key)?; - let journal_entry = StoredEntry::decode(&mut value) - .map_err(|err| StorageError::Conversion(err.into()))?; - - let (partition_key, invocation_uuid, entry_index) = journal_key.into_inner_ok_or()?; + let latest_iterator = OwnedIterator::new( + storage.iterator_from(FullScanPartitionKeyRange::(range.clone()))?, + ) + .map(|(mut key, mut value)| { + let journal_key = JournalKey::deserialize_from(&mut key)?; + let journal_entry = + StoredEntry::decode(&mut value).map_err(|err| StorageError::Conversion(err.into()))?; + + let (partition_key, invocation_uuid, entry_index) = journal_key.into_inner_ok_or()?; + + Ok(( + JournalEntryId::from_parts( + InvocationId::from_parts(partition_key, invocation_uuid), + journal_entry.epoch, + entry_index, + ), + journal_entry.entry, + )) + }); - Ok(( - JournalEntryId::from_parts( - InvocationId::from_parts(partition_key, invocation_uuid), - entry_index, - ), - journal_entry.0, - )) + let archived_iterator = OwnedIterator::new( + storage.iterator_from(FullScanPartitionKeyRange::(range))?, + ) + .map(|(mut key, mut value)| { + let journal_key = ArchivedJournalKey::deserialize_from(&mut key)?; + let journal_entry = + StoredEntry::decode(&mut value).map_err(|err| StorageError::Conversion(err.into()))?; + 
+ let (partition_key, invocation_uuid, epoch, entry_index) = + journal_key.into_inner_ok_or()?; + + Ok(( + JournalEntryId::from_parts( + InvocationId::from_parts(partition_key, invocation_uuid), + epoch, + entry_index, + ), + journal_entry.entry, + )) + }); + + Ok(stream::iter(archived_iterator.merge_by( + latest_iterator, + |is1, is2| { + match (is1, is2) { + (Ok((id1, _)), Ok((id2, _))) + if id1.invocation_id() == id2.invocation_id() + && id1.invocation_epoch() == id2.invocation_epoch() => + { + id1.journal_index() <= id2.journal_index() + } + (Ok((id1, _)), Ok((id2, _))) if id1.invocation_id() == id2.invocation_id() => { + id1.invocation_epoch() <= id2.invocation_epoch() + } + (Ok((id1, _)), Ok((id2, _))) => id1.invocation_id() <= id2.invocation_id(), + // Doesn't matter if there's an error in between + (_, _) => true, + } }, ))) } @@ -184,10 +287,35 @@ fn all_journals( fn delete_journal( storage: &mut S, invocation_id: &InvocationId, + invocation_epoch: Option, journal_length: EntryIndex, ) -> Result<()> { let _x = RocksDbPerfGuard::new("delete-journal"); + let Some(invocation_epoch) = invocation_epoch else { + delete_latest_journal(storage, invocation_id, journal_length)?; + return Ok(()); + }; + + // Check if latest journal has the same epoch of the provided one. + let is_latest = storage + .get_value::<_, StoredEntry>(write_journal_entry_key(invocation_id, invocation_epoch))? 
+ .is_some_and(|s| s.epoch == invocation_epoch); + + if is_latest { + delete_latest_journal(storage, invocation_id, journal_length)?; + } else { + delete_archived_journal(storage, invocation_id, invocation_epoch, journal_length)?; + } + + Ok(()) +} + +fn delete_latest_journal( + storage: &mut S, + invocation_id: &InvocationId, + journal_length: EntryIndex, +) -> Result<()> { let mut key = write_journal_entry_key(invocation_id, 0); let k = &mut key; for journal_index in 0..journal_length { @@ -245,6 +373,114 @@ fn delete_journal( Ok(()) } +fn delete_archived_journal( + storage: &mut S, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + journal_length: EntryIndex, +) -> Result<()> { + let mut key = write_archived_journal_entry_key(invocation_id, invocation_epoch, 0); + let k = &mut key; + for journal_index in 0..journal_length { + k.journal_index = Some(journal_index); + storage.delete_key(k)?; + } + + Ok(()) +} + +fn delete_journal_range( + storage: &mut S, + invocation_id: &InvocationId, + from_included: EntryIndex, + to_excluded: EntryIndex, + notification_ids_to_cleanup: &[NotificationId], +) -> Result<()> { + let _x = RocksDbPerfGuard::new("delete-journal-range"); + + // Delete entries + let mut key = write_journal_entry_key(invocation_id, 0); + let k = &mut key; + for journal_index in from_included..to_excluded { + k.journal_index = Some(journal_index); + storage.delete_key(k)?; + } + + // Clean indexes + if !notification_ids_to_cleanup.is_empty() { + let mut notification_id_to_notification_index = + JournalNotificationIdToNotificationIndexKey::default() + .partition_key(invocation_id.partition_key()) + .invocation_uuid(invocation_id.invocation_uuid()); + let mut completion_id_to_command_index = JournalCompletionIdToCommandIndexKey::default() + .partition_key(invocation_id.partition_key()) + .invocation_uuid(invocation_id.invocation_uuid()); + for notification_id_to_cleanup in notification_ids_to_cleanup { + 
notification_id_to_notification_index.notification_id = + Some(notification_id_to_cleanup.clone()); + storage.delete_key(¬ification_id_to_notification_index)?; + + if let NotificationId::CompletionId(completion_id) = notification_id_to_cleanup { + completion_id_to_command_index.completion_id = Some(*completion_id); + storage.delete_key(&completion_id_to_command_index)?; + } + } + } + + Ok(()) +} + +fn archive_journal_to_epoch( + storage: &mut S, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + journal_length: EntryIndex, +) -> Result<()> { + let _x = RocksDbPerfGuard::new("archive-journal-to-epoch"); + + let mut latest_key = write_journal_entry_key(invocation_id, 0); + let mut archived_key = write_archived_journal_entry_key(invocation_id, invocation_epoch, 0); + for journal_index in 0..journal_length { + latest_key.journal_index = Some(journal_index); + archived_key.journal_index = Some(journal_index); + + let Some(mut entry) = storage.get_value::<_, StoredEntry>(latest_key.clone())? else { + return Err(StorageError::Generic(anyhow!( + "Expected entry to be not empty" + ))); + }; + + entry.epoch = invocation_epoch; + storage.put_kv(archived_key.clone(), &entry)?; + } + + Ok(()) +} + +fn update_current_journal_epoch( + storage: &mut S, + invocation_id: &InvocationId, + new_epoch: InvocationEpoch, + length: EntryIndex, +) -> Result<()> { + let _x = RocksDbPerfGuard::new("update-current-journal-epoch"); + + let mut key = write_journal_entry_key(invocation_id, 0); + for journal_index in 0..length { + key.journal_index = Some(journal_index); + let Some(mut entry) = storage.get_value::<_, StoredEntry>(key.clone())? 
else { + return Err(StorageError::Generic(anyhow!( + "Expected entry to be not empty" + ))); + }; + + entry.epoch = new_epoch; + storage.put_kv(key.clone(), &entry)?; + } + + Ok(()) +} + fn get_notifications_index( storage: &mut S, invocation_id: InvocationId, @@ -295,7 +531,7 @@ fn get_command_by_completion_id( return Ok(None); } - let entry = opt.unwrap().0; + let entry = opt.unwrap().entry; let entry_ty = entry.ty(); Ok(Some(entry.inner.try_as_command().ok_or_else(|| { StorageError::Conversion(anyhow!( @@ -308,24 +544,34 @@ impl ReadOnlyJournalTable for PartitionStore { async fn get_journal_entry( &mut self, invocation_id: InvocationId, - journal_index: u32, + entry_index: EntryIndex, ) -> Result> { self.assert_partition_key(&invocation_id)?; - let _x = RocksDbPerfGuard::new("get-journal-entry"); - get_journal_entry(self, &invocation_id, journal_index) + get_journal_entry(self, &invocation_id, entry_index) + } + + async fn get_journal_entry_for_epoch( + &mut self, + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + entry_index: EntryIndex, + ) -> Result> { + self.assert_partition_key(&invocation_id)?; + get_journal_entry_for_epoch(self, &invocation_id, invocation_epoch, entry_index) } fn get_journal( &mut self, invocation_id: InvocationId, - journal_length: EntryIndex, + length: EntryIndex, ) -> Result> + Send> { self.assert_partition_key(&invocation_id)?; - Ok(stream::iter(get_journal( - self, - &invocation_id, - journal_length, - )?)) + Ok(stream::iter(get_journal(self, &invocation_id, length)?)) + } + + async fn has_journal(&mut self, invocation_id: &InvocationId) -> Result { + self.assert_partition_key(invocation_id)?; + Ok(get_journal_entry(self, invocation_id, 0)?.is_some()) } fn all_journals( @@ -355,24 +601,35 @@ impl ReadOnlyJournalTable for PartitionStoreTransaction<'_> { async fn get_journal_entry( &mut self, invocation_id: InvocationId, - journal_index: u32, + entry_index: EntryIndex, ) -> Result> { 
self.assert_partition_key(&invocation_id)?; let _x = RocksDbPerfGuard::new("get-journal-entry"); - get_journal_entry(self, &invocation_id, journal_index) + get_journal_entry(self, &invocation_id, entry_index) + } + + async fn get_journal_entry_for_epoch( + &mut self, + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + entry_index: EntryIndex, + ) -> Result> { + self.assert_partition_key(&invocation_id)?; + get_journal_entry_for_epoch(self, &invocation_id, invocation_epoch, entry_index) } fn get_journal( &mut self, invocation_id: InvocationId, - journal_length: EntryIndex, + length: EntryIndex, ) -> Result> + Send> { self.assert_partition_key(&invocation_id)?; - Ok(stream::iter(get_journal( - self, - &invocation_id, - journal_length, - )?)) + Ok(stream::iter(get_journal(self, &invocation_id, length)?)) + } + + async fn has_journal(&mut self, invocation_id: &InvocationId) -> Result { + self.assert_partition_key(invocation_id)?; + Ok(get_journal_entry(self, invocation_id, 0)?.is_some()) } fn all_journals( @@ -402,22 +659,67 @@ impl JournalTable for PartitionStoreTransaction<'_> { async fn put_journal_entry( &mut self, invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, index: u32, entry: &RawEntry, related_completion_ids: &[CompletionId], ) -> Result<()> { self.assert_partition_key(&invocation_id)?; - put_journal_entry(self, &invocation_id, index, entry, related_completion_ids) + put_journal_entry( + self, + &invocation_id, + invocation_epoch, + index, + entry, + related_completion_ids, + ) + } + + async fn archive_journal_to_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + journal_length: EntryIndex, + ) -> Result<()> { + self.assert_partition_key(invocation_id)?; + archive_journal_to_epoch(self, invocation_id, invocation_epoch, journal_length) + } + + async fn update_current_journal_epoch( + &mut self, + invocation_id: &InvocationId, + new_epoch: InvocationEpoch, + length: EntryIndex, + ) -> 
Result<()> { + self.assert_partition_key(invocation_id)?; + update_current_journal_epoch(self, invocation_id, new_epoch, length) } async fn delete_journal( &mut self, invocation_id: InvocationId, + invocation_epoch: Option, journal_length: EntryIndex, ) -> Result<()> { self.assert_partition_key(&invocation_id)?; - let _x = RocksDbPerfGuard::new("delete-journal"); - delete_journal(self, &invocation_id, journal_length) + delete_journal(self, &invocation_id, invocation_epoch, journal_length) + } + + async fn delete_journal_range( + &mut self, + invocation_id: InvocationId, + from_included: EntryIndex, + to_excluded: EntryIndex, + notification_ids_to_cleanup: &[NotificationId], + ) -> Result<()> { + self.assert_partition_key(&invocation_id)?; + delete_journal_range( + self, + &invocation_id, + from_included, + to_excluded, + notification_ids_to_cleanup, + ) } } diff --git a/crates/partition-store/src/keys.rs b/crates/partition-store/src/keys.rs index cdd3064dd9..432b8dcdec 100644 --- a/crates/partition-store/src/keys.rs +++ b/crates/partition-store/src/keys.rs @@ -32,8 +32,10 @@ pub enum KeyKind { Inbox, InvocationStatusV1, InvocationStatus, + ArchivedInvocationStatus, Journal, JournalV2, + ArchivedJournalV2, JournalV2NotificationIdToNotificationIndex, JournalV2CompletionIdToCommandIndex, Outbox, @@ -75,10 +77,12 @@ impl KeyKind { KeyKind::Inbox => b"ib", KeyKind::InvocationStatusV1 => b"is", KeyKind::InvocationStatus => b"iS", + KeyKind::ArchivedInvocationStatus => b"iA", KeyKind::Journal => b"jo", KeyKind::JournalV2NotificationIdToNotificationIndex => b"jn", KeyKind::JournalV2CompletionIdToCommandIndex => b"jc", KeyKind::JournalV2 => b"j2", + KeyKind::ArchivedJournalV2 => b"jA", KeyKind::Outbox => b"ob", KeyKind::ServiceStatus => b"ss", KeyKind::State => b"st", @@ -103,8 +107,10 @@ impl KeyKind { b"ib" => Some(KeyKind::Inbox), b"is" => Some(KeyKind::InvocationStatusV1), b"iS" => Some(KeyKind::InvocationStatus), + b"iA" => Some(KeyKind::ArchivedInvocationStatus), 
b"jo" => Some(KeyKind::Journal), b"j2" => Some(KeyKind::JournalV2), + b"jA" => Some(KeyKind::ArchivedJournalV2), b"jn" => Some(KeyKind::JournalV2NotificationIdToNotificationIndex), b"jc" => Some(KeyKind::JournalV2CompletionIdToCommandIndex), b"ob" => Some(KeyKind::Outbox), diff --git a/crates/partition-store/src/partition_store.rs b/crates/partition-store/src/partition_store.rs index b7a1bd1346..c801244e4a 100644 --- a/crates/partition-store/src/partition_store.rs +++ b/crates/partition-store/src/partition_store.rs @@ -142,7 +142,11 @@ impl TableKind { pub const fn key_kinds(self) -> &'static [KeyKind] { match self { Self::State => &[KeyKind::State], - Self::InvocationStatus => &[KeyKind::InvocationStatusV1, KeyKind::InvocationStatus], + Self::InvocationStatus => &[ + KeyKind::InvocationStatusV1, + KeyKind::InvocationStatus, + KeyKind::ArchivedInvocationStatus, + ], Self::ServiceStatus => &[KeyKind::ServiceStatus], Self::Idempotency => &[KeyKind::Idempotency], Self::Inbox => &[KeyKind::Inbox], @@ -154,6 +158,7 @@ impl TableKind { KeyKind::Journal, KeyKind::InvocationStatus, KeyKind::JournalV2, + KeyKind::ArchivedJournalV2, KeyKind::JournalV2CompletionIdToCommandIndex, KeyKind::JournalV2NotificationIdToNotificationIndex, ], diff --git a/crates/partition-store/src/protobuf_types.rs b/crates/partition-store/src/protobuf_types.rs index 6fef9bd1cf..dba9444bba 100644 --- a/crates/partition-store/src/protobuf_types.rs +++ b/crates/partition-store/src/protobuf_types.rs @@ -623,6 +623,7 @@ pub mod v1 { journal_retention_duration .unwrap_or_default() .try_into()?, + invocation_epoch: current_invocation_epoch, journal_metadata: restate_storage_api::invocation_status_table::JournalMetadata { length: journal_length, commands, @@ -632,6 +633,9 @@ pub mod v1 { deployment_id, service_protocol_version, )?, + completion_range_epoch_map: CompletionRangeEpochMap::from_trim_points( + trim_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) + ), 
}, )) } @@ -945,12 +949,16 @@ pub mod v1 { invocation_target, created_using_restate_version, source, - execution_time, idempotency_key, + execution_time, + idempotency_key, timestamps, response_result, completion_retention_duration, - journal_retention_duration, journal_metadata, - pinned_deployment + journal_retention_duration, + invocation_epoch, + journal_metadata, + pinned_deployment, + completion_range_epoch_map }, ) => { let (deployment_id, service_protocol_version) = match pinned_deployment { @@ -995,7 +1003,7 @@ pub mod v1 { deployment_id, service_protocol_version, hotfix_apply_cancellation_after_deployment_is_pinned: false, - current_invocation_epoch: 0, + current_invocation_epoch: invocation_epoch, trim_points: vec![], waiting_for_completions: vec![], waiting_for_signal_indexes: vec![], @@ -1061,6 +1069,30 @@ pub mod v1 { } } + impl TryFrom + for crate::invocation_status_table::InvocationStatusV2OnlyEpoch + { + type Error = ConversionError; + + fn try_from( + value: super::InvocationStatusV2OnlyEpoch, + ) -> Result { + Ok(Self { + current_invocation_epoch: value.current_invocation_epoch, + }) + } + } + + impl From + for super::InvocationStatusV2OnlyEpoch + { + fn from(_: crate::invocation_status_table::InvocationStatusV2OnlyEpoch) -> Self { + panic!( + "Unexpected usage of InvocationStatusV2OnlyEpoch, this data structure can be used only for reading, and never for writing" + ) + } + } + impl TryFrom for crate::invocation_status_table::InvocationStatusV1 { type Error = ConversionError; @@ -1604,8 +1636,10 @@ pub mod v1 { completion_retention_duration: std::time::Duration::MAX, execution_time: None, journal_retention_duration: Default::default(), + invocation_epoch: 0, journal_metadata: JournalMetadata::empty(), pinned_deployment: None, + completion_range_epoch_map: Default::default(), }, ) } @@ -1630,6 +1664,8 @@ pub mod v1 { journal_metadata: _, // The old invocation status table doesn't support PinnedDeployment on Completed pinned_deployment: _, + 
invocation_epoch: _, + completion_range_epoch_map: _, } = value; Completed { @@ -3174,8 +3210,8 @@ pub mod v1 { append_time: value.append_time.into(), }; - Ok(crate::journal_table_v2::StoredEntry( - match EntryType::try_from(value.ty) + Ok(crate::journal_table_v2::StoredEntry { + entry: match EntryType::try_from(value.ty) .map_err(|e| ConversionError::unexpected_enum_variant("ty", e.0))? .try_into()? { @@ -3245,13 +3281,17 @@ pub mod v1 { journal_v2::raw::RawCommand::new(ct, value.content), ), }, - )) + epoch: value.invocation_epoch, + }) } } impl From for Entry { fn from( - crate::journal_table_v2::StoredEntry(raw_entry): crate::journal_table_v2::StoredEntry, + crate::journal_table_v2::StoredEntry { + entry: raw_entry, + epoch, + }: crate::journal_table_v2::StoredEntry, ) -> Self { let ty = EntryType::from(raw_entry.ty()); let append_time = raw_entry.header().append_time.into(); @@ -3306,6 +3346,7 @@ pub mod v1 { append_time, call_or_send_command_metadata, notification_id, + invocation_epoch: epoch, } } } diff --git a/crates/partition-store/src/tests/journal_table_v2_test/mod.rs b/crates/partition-store/src/tests/journal_table_v2_test/mod.rs index 1d5dbdb038..2c8f2ebcb2 100644 --- a/crates/partition-store/src/tests/journal_table_v2_test/mod.rs +++ b/crates/partition-store/src/tests/journal_table_v2_test/mod.rs @@ -93,6 +93,7 @@ async fn populate_sleep_journal(txn: &mut T) { for i in 0..5 { txn.put_journal_entry( MOCK_INVOCATION_ID_1, + 0, i, &mock_sleep_command(i).encode::(), &[i], @@ -103,6 +104,7 @@ async fn populate_sleep_journal(txn: &mut T) { for i in 5..10 { txn.put_journal_entry( MOCK_INVOCATION_ID_1, + 0, i, &mock_sleep_completion(i - 5).encode::(), &[], @@ -184,7 +186,7 @@ async fn sleep_point_lookups(txn: &mut T) { } async fn delete_journal(txn: &mut T, length: usize) { - txn.delete_journal(MOCK_INVOCATION_ID_1, length as u32) + txn.delete_journal(MOCK_INVOCATION_ID_1, None, length as u32) .await .unwrap(); } @@ -245,6 +247,7 @@ async fn 
test_call_journal() { txn.put_journal_entry( MOCK_INVOCATION_ID_1, 0, + 0, &mock_call_command(0, 1).encode::(), &[0, 1], ) @@ -252,6 +255,7 @@ async fn test_call_journal() { .unwrap(); txn.put_journal_entry( MOCK_INVOCATION_ID_1, + 0, 1, &mock_one_way_call_command(2).encode::(), &[2], @@ -316,6 +320,7 @@ async fn test_event() { txn.put_journal_entry( MOCK_INVOCATION_ID_1, 0, + 0, &Entry::Event(event.clone()).encode::(), &[], ) From a11034345dd2b5b957ba4872550f3a67ea69ec52 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:15:44 +0200 Subject: [PATCH 09/14] Make sure we read from journal table v2, even if the pinned deployment is not set. This is a new condition that can happen with reset, because we don't re-migrate the journal table back to v1. --- .../src/partition/invoker_storage_reader.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/crates/worker/src/partition/invoker_storage_reader.rs b/crates/worker/src/partition/invoker_storage_reader.rs index 9196732d5b..d32861af3e 100644 --- a/crates/worker/src/partition/invoker_storage_reader.rs +++ b/crates/worker/src/partition/invoker_storage_reader.rs @@ -88,10 +88,19 @@ where unsafe { invoked_status.timestamps.modification_time() }, ); - let journal_stream = if invoked_status - .pinned_deployment - .is_some_and(|p| p.service_protocol_version >= ServiceProtocolVersion::V4) - { + let read_journal_table_v2 = match invoked_status.pinned_deployment { + None => { + // Let's just check if there's anything in journal table v2! + journal_table_v2::ReadOnlyJournalTable::has_journal( + &mut self.txn, + invocation_id, + ) + .await? + } + Some(pd) => pd.service_protocol_version >= ServiceProtocolVersion::V4, + }; + + let journal_stream = if read_journal_table_v2 { // If pinned service protocol version exists and >= V4, we need to read from Journal Table V2! 
journal_table_v2::ReadOnlyJournalTable::get_journal( &mut self.txn, From 0f22f0ce389fc80bd5023fb3dc346cc04374d3bb Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 13:16:08 +0200 Subject: [PATCH 10/14] Mindless plumbing --- Cargo.lock | 2 + .../partition_processor_rpc_client.rs | 50 +++++++- crates/types/src/errors.rs | 6 + crates/types/src/invocation/client.rs | 32 ++++- crates/types/src/journal_v2/notification.rs | 6 + crates/types/src/net/partition_processor.rs | 74 +++++++++-- crates/wal-protocol/src/lib.rs | 14 ++- .../src/partition/leadership/leader_state.rs | 10 ++ crates/worker/src/partition/mod.rs | 36 +++++- .../src/partition/state_machine/actions.rs | 6 +- .../partition/state_machine/entries/mod.rs | 39 ++---- .../state_machine/entries/write_entry.rs | 52 ++++++++ .../lifecycle/migrate_journal_table.rs | 1 + .../partition/state_machine/lifecycle/mod.rs | 5 + .../worker/src/partition/state_machine/mod.rs | 115 +++++++----------- .../partition/state_machine/tests/fixtures.rs | 10 +- .../state_machine/tests/idempotency.rs | 3 + .../partition/state_machine/tests/matchers.rs | 10 +- .../src/partition/state_machine/tests/mod.rs | 79 +++++++++++- .../partition/state_machine/tests/workflow.rs | 2 + tools/xtask/src/main.rs | 18 ++- 21 files changed, 448 insertions(+), 122 deletions(-) create mode 100644 crates/worker/src/partition/state_machine/entries/write_entry.rs diff --git a/Cargo.lock b/Cargo.lock index d3d5305fe5..504064bf85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6476,6 +6476,7 @@ dependencies = [ "restate-errors", "restate-futures-util", "restate-metadata-store", + "restate-serde-util", "restate-service-client", "restate-service-protocol", "restate-storage-query-datafusion", @@ -7288,6 +7289,7 @@ dependencies = [ "futures-util", "googletest", "humantime", + "itertools 0.14.0", "num-bigint", "opentelemetry", "paste", diff --git a/crates/core/src/worker_api/partition_processor_rpc_client.rs 
b/crates/core/src/worker_api/partition_processor_rpc_client.rs index 8f9868e7f3..15b9be39c7 100644 --- a/crates/core/src/worker_api/partition_processor_rpc_client.rs +++ b/crates/core/src/worker_api/partition_processor_rpc_client.rs @@ -21,9 +21,12 @@ use restate_types::identifiers::{ use restate_types::invocation::client::{ AttachInvocationResponse, CancelInvocationResponse, GetInvocationOutputResponse, InvocationClient, InvocationClientError, InvocationOutput, KillInvocationResponse, - PurgeInvocationResponse, SubmittedInvocationNotification, + PurgeInvocationResponse, RestartInvocationResponse, SubmittedInvocationNotification, +}; +use restate_types::invocation::restart::{ApplyToWorkflowRun, IfRunning}; +use restate_types::invocation::{ + InvocationEpoch, InvocationQuery, InvocationRequest, InvocationResponse, }; -use restate_types::invocation::{InvocationQuery, InvocationRequest, InvocationResponse}; use restate_types::journal_v2::Signal; use restate_types::live::Live; use restate_types::net::codec::EncodeError; @@ -32,6 +35,7 @@ use restate_types::net::partition_processor::{ PartitionProcessorRpcRequest, PartitionProcessorRpcRequestInner, PartitionProcessorRpcResponse, }; use restate_types::partition_table::{FindPartition, PartitionTable, PartitionTableError}; +use std::time::Duration; use tracing::trace; #[derive(Debug, thiserror::Error)] @@ -450,11 +454,15 @@ where &self, request_id: PartitionProcessorRpcRequestId, invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, ) -> Result { let response = self .resolve_partition_id_and_send( request_id, - PartitionProcessorRpcRequestInner::PurgeInvocation { invocation_id }, + PartitionProcessorRpcRequestInner::PurgeInvocation { + invocation_id, + invocation_epoch, + }, ) .await?; @@ -472,11 +480,15 @@ where &self, request_id: PartitionProcessorRpcRequestId, invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, ) -> Result { let response = self .resolve_partition_id_and_send( request_id, - 
PartitionProcessorRpcRequestInner::PurgeJournal { invocation_id }, + PartitionProcessorRpcRequestInner::PurgeJournal { + invocation_id, + invocation_epoch, + }, ) .await?; @@ -489,4 +501,34 @@ where } }) } + + async fn restart_invocation( + &self, + request_id: PartitionProcessorRpcRequestId, + invocation_id: InvocationId, + if_running: IfRunning, + previous_attempt_retention: Option, + apply_to_workflow_run: ApplyToWorkflowRun, + ) -> Result { + let response = self + .resolve_partition_id_and_send( + request_id, + PartitionProcessorRpcRequestInner::RestartInvocation { + invocation_id, + if_running, + previous_attempt_retention, + apply_to_workflow_run, + }, + ) + .await?; + + Ok(match response { + PartitionProcessorRpcResponse::RestartInvocation(purge_invocation_response) => { + purge_invocation_response.into() + } + _ => { + panic!("Expecting RestartInvocation rpc response") + } + }) + } } diff --git a/crates/types/src/errors.rs b/crates/types/src/errors.rs index 72c18f6e15..3ecc51de4a 100644 --- a/crates/types/src/errors.rs +++ b/crates/types/src/errors.rs @@ -175,6 +175,7 @@ pub mod codes { PROTOCOL_VIOLATION 571 "Protocol violation", CONFLICT 409 "Conflict", NOT_READY 470 "Not ready", + RESTARTED 471 "Restarted", ); } @@ -295,6 +296,11 @@ impl From for InvocationError { pub const KILLED_INVOCATION_ERROR: InvocationError = InvocationError::new_static(codes::ABORTED, "killed"); +pub const RESTARTED_INVOCATION_ERROR: InvocationError = InvocationError::new_static( + codes::RESTARTED, + "The invocation was restarted. You can re-attach to it to retrieve the new result.", +); + // TODO: Once we want to distinguish server side cancellations from user code returning the // UserErrorCode::Cancelled, we need to add a new RestateErrorCode. 
pub const CANCELED_INVOCATION_ERROR: InvocationError = diff --git a/crates/types/src/invocation/client.rs b/crates/types/src/invocation/client.rs index ea120b218d..29ea90e5ab 100644 --- a/crates/types/src/invocation/client.rs +++ b/crates/types/src/invocation/client.rs @@ -10,10 +10,14 @@ use crate::errors::InvocationError; use crate::identifiers::{InvocationId, PartitionProcessorRpcRequestId}; -use crate::invocation::{InvocationQuery, InvocationRequest, InvocationResponse, InvocationTarget}; +use crate::invocation::restart::{ApplyToWorkflowRun, IfRunning}; +use crate::invocation::{ + InvocationEpoch, InvocationQuery, InvocationRequest, InvocationResponse, InvocationTarget, +}; use crate::journal_v2::Signal; use crate::time::MillisSinceEpoch; use bytes::Bytes; +use std::time::Duration; #[derive(Debug, thiserror::Error)] #[error("{inner}")] @@ -105,6 +109,16 @@ pub enum PurgeInvocationResponse { NotCompleted, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RestartInvocationResponse { + Ok, + NotFound, + StillRunning, + Unsupported, + MissingInput, + NotStarted, +} + /// This trait provides the functionalities to interact with Restate invocations. pub trait InvocationClient { /// Append the invocation to the log, waiting for the PP to emit [`SubmittedInvocationNotification`] when the command is processed. @@ -164,17 +178,29 @@ pub trait InvocationClient { invocation_id: InvocationId, ) -> impl Future> + Send; - /// Purge the given invocation. This cleanups all the state for the given invocation. This command applies only to completed invocations. + /// Purge the given invocation. This cleanups all the state for the given invocation. This command applies only to completed invocations. If the invocation_epoch is the latest, all the previous epochs will be purged as well. 
fn purge_invocation( &self, request_id: PartitionProcessorRpcRequestId, invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, ) -> impl Future> + Send; - /// Purge the given invocation journal. This cleanups only the journal for the given invocation, retaining the metadata. This command applies only to completed invocations. + /// Purge the given invocation journal. This cleanups only the journal for the given invocation, retaining the metadata. This command applies only to completed invocations. If the invocation_epoch is the latest, all the previous epochs will be purged as well. fn purge_journal( &self, request_id: PartitionProcessorRpcRequestId, invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, ) -> impl Future> + Send; + + /// See [`crate::invocation::restart::Request`]. + fn restart_invocation( + &self, + request_id: PartitionProcessorRpcRequestId, + invocation_id: InvocationId, + if_running: IfRunning, + previous_attempt_retention: Option, + apply_to_workflow_run: ApplyToWorkflowRun, + ) -> impl Future> + Send; } diff --git a/crates/types/src/journal_v2/notification.rs b/crates/types/src/journal_v2/notification.rs index 6a45e26d25..a4ab3d0c8b 100644 --- a/crates/types/src/journal_v2/notification.rs +++ b/crates/types/src/journal_v2/notification.rs @@ -75,6 +75,12 @@ impl From for NotificationType { } } +impl From for EntryType { + fn from(value: CompletionType) -> Self { + EntryType::Notification(value.into()) + } +} + #[enum_dispatch] pub trait NotificationMetadata { fn id(&self) -> NotificationId; diff --git a/crates/types/src/net/partition_processor.rs b/crates/types/src/net/partition_processor.rs index 94e01cd29a..6a52a205b5 100644 --- a/crates/types/src/net/partition_processor.rs +++ b/crates/types/src/net/partition_processor.rs @@ -13,13 +13,15 @@ use crate::identifiers::{ }; use crate::invocation::client::{ CancelInvocationResponse, InvocationOutput, KillInvocationResponse, PurgeInvocationResponse, - 
SubmittedInvocationNotification, + RestartInvocationResponse, SubmittedInvocationNotification, }; -use crate::invocation::{InvocationQuery, InvocationRequest, InvocationResponse}; +use crate::invocation::restart::{ApplyToWorkflowRun, IfRunning}; +use crate::invocation::{InvocationEpoch, InvocationQuery, InvocationRequest, InvocationResponse}; use crate::journal_v2::Signal; use crate::net::ServiceTag; use crate::net::{default_wire_codec, define_rpc, define_service}; use serde::{Deserialize, Serialize}; +use std::time::Duration; pub struct PartitionLeaderService; @@ -68,10 +70,26 @@ pub enum PartitionProcessorRpcRequestInner { GetInvocationOutput(InvocationQuery, GetInvocationOutputResponseMode), AppendInvocationResponse(InvocationResponse), AppendSignal(InvocationId, Signal), - CancelInvocation { invocation_id: InvocationId }, - KillInvocation { invocation_id: InvocationId }, - PurgeInvocation { invocation_id: InvocationId }, - PurgeJournal { invocation_id: InvocationId }, + CancelInvocation { + invocation_id: InvocationId, + }, + KillInvocation { + invocation_id: InvocationId, + }, + PurgeInvocation { + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + }, + PurgeJournal { + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + }, + RestartInvocation { + invocation_id: InvocationId, + if_running: IfRunning, + previous_attempt_retention: Option, + apply_to_workflow_run: ApplyToWorkflowRun, + }, } impl WithPartitionKey for PartitionProcessorRpcRequestInner { @@ -87,10 +105,13 @@ impl WithPartitionKey for PartitionProcessorRpcRequestInner { PartitionProcessorRpcRequestInner::KillInvocation { invocation_id } => { invocation_id.partition_key() } - PartitionProcessorRpcRequestInner::PurgeInvocation { invocation_id } => { + PartitionProcessorRpcRequestInner::PurgeInvocation { invocation_id, .. 
} => { invocation_id.partition_key() } - PartitionProcessorRpcRequestInner::PurgeJournal { invocation_id } => { + PartitionProcessorRpcRequestInner::PurgeJournal { invocation_id, .. } => { + invocation_id.partition_key() + } + PartitionProcessorRpcRequestInner::RestartInvocation { invocation_id, .. } => { invocation_id.partition_key() } } @@ -207,6 +228,42 @@ impl From for PurgeInvocationRpcResponse { } } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum RestartInvocationRpcResponse { + Ok, + NotFound, + StillRunning, + Unsupported, + MissingInput, + NotStarted, +} + +impl From for RestartInvocationResponse { + fn from(value: RestartInvocationRpcResponse) -> Self { + match value { + RestartInvocationRpcResponse::Ok => RestartInvocationResponse::Ok, + RestartInvocationRpcResponse::NotFound => RestartInvocationResponse::NotFound, + RestartInvocationRpcResponse::StillRunning => RestartInvocationResponse::StillRunning, + RestartInvocationRpcResponse::Unsupported => RestartInvocationResponse::Unsupported, + RestartInvocationRpcResponse::MissingInput => RestartInvocationResponse::MissingInput, + RestartInvocationRpcResponse::NotStarted => RestartInvocationResponse::NotStarted, + } + } +} + +impl From for RestartInvocationRpcResponse { + fn from(value: RestartInvocationResponse) -> Self { + match value { + RestartInvocationResponse::Ok => RestartInvocationRpcResponse::Ok, + RestartInvocationResponse::NotFound => RestartInvocationRpcResponse::NotFound, + RestartInvocationResponse::StillRunning => RestartInvocationRpcResponse::StillRunning, + RestartInvocationResponse::Unsupported => RestartInvocationRpcResponse::Unsupported, + RestartInvocationResponse::MissingInput => RestartInvocationRpcResponse::MissingInput, + RestartInvocationResponse::NotStarted => RestartInvocationRpcResponse::NotStarted, + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum PartitionProcessorRpcResponse { Appended, @@ -219,4 +276,5 @@ pub enum 
PartitionProcessorRpcResponse { KillInvocation(KillInvocationRpcResponse), PurgeInvocation(PurgeInvocationRpcResponse), PurgeJournal(PurgeInvocationRpcResponse), + RestartInvocation(RestartInvocationRpcResponse), } diff --git a/crates/wal-protocol/src/lib.rs b/crates/wal-protocol/src/lib.rs index d441822471..6d33bb42d4 100644 --- a/crates/wal-protocol/src/lib.rs +++ b/crates/wal-protocol/src/lib.rs @@ -13,7 +13,7 @@ use restate_types::GenerationalNodeId; use restate_types::identifiers::{LeaderEpoch, PartitionId, PartitionKey, WithPartitionKey}; use restate_types::invocation::{ AttachInvocationRequest, GetInvocationOutputResponse, InvocationResponse, - InvocationTermination, NotifySignalRequest, PurgeInvocationRequest, ServiceInvocation, + InvocationTermination, NotifySignalRequest, PurgeInvocationRequest, ServiceInvocation, restart, }; use restate_types::logs::{HasRecordKeys, Keys, MatchKeyQuery}; use restate_types::message::MessageIndex; @@ -148,6 +148,8 @@ pub enum Command { ProxyThrough(ServiceInvocation), /// Attach to an existing invocation AttachInvocation(AttachInvocationRequest), + /// Restart an invocation + RestartInvocation(restart::Request), // -- Partition processor events for PP /// Invoker is reporting effect(s) from an ongoing invocation. @@ -203,6 +205,7 @@ impl HasRecordKeys for Envelope { } Command::PurgeInvocation(purge) => Keys::Single(purge.invocation_id.partition_key()), Command::PurgeJournal(purge) => Keys::Single(purge.invocation_id.partition_key()), + Command::RestartInvocation(restart) => Keys::Single(restart.partition_key()), Command::Invoke(invoke) => Keys::Single(invoke.partition_key()), // todo: Remove this, or pass the partition key range but filter based on partition-id // on read if needed. 
@@ -316,6 +319,7 @@ mod envelope { NotifyGetInvocationOutputResponse = 13, // bilrost NotifySignal = 14, // protobuf PurgeJournal = 15, // flexbuffers + RestartInvocation = 16, // flexbuffers } #[derive(bilrost::Message)] @@ -441,6 +445,10 @@ mod envelope { CommandKind::PurgeJournal, Field::encode_serde(StorageCodecKind::FlexbuffersSerde, value), ), + Command::RestartInvocation(value) => ( + CommandKind::RestartInvocation, + Field::encode_serde(StorageCodecKind::FlexbuffersSerde, value), + ), Command::Invoke(value) => { let value = protobuf::ServiceInvocation::from(value.clone()); (CommandKind::Invoke, Field::encode_protobuf(&value)) @@ -530,6 +538,10 @@ mod envelope { codec_or_error!(envelope.command, StorageCodecKind::FlexbuffersSerde); Command::PurgeJournal(envelope.command.decode_serde()?) } + CommandKind::RestartInvocation => { + codec_or_error!(envelope.command, StorageCodecKind::FlexbuffersSerde); + Command::RestartInvocation(envelope.command.decode_serde()?) + } CommandKind::Invoke => { codec_or_error!(envelope.command, StorageCodecKind::Protobuf); let value: protobuf::ServiceInvocation = envelope.command.decode_protobuf()?; diff --git a/crates/worker/src/partition/leadership/leader_state.rs b/crates/worker/src/partition/leadership/leader_state.rs index 7d6c84d292..198dd37e9d 100644 --- a/crates/worker/src/partition/leadership/leader_state.rs +++ b/crates/worker/src/partition/leadership/leader_state.rs @@ -501,6 +501,16 @@ impl LeaderState { ))); } } + Action::ForwardRestartInvocationResponse { + request_id, + response, + } => { + if let Some(response_tx) = self.awaiting_rpc_actions.remove(&request_id) { + response_tx.send(Ok(PartitionProcessorRpcResponse::RestartInvocation( + response.into(), + ))); + } + } } Ok(()) diff --git a/crates/worker/src/partition/mod.rs b/crates/worker/src/partition/mod.rs index ed272a90cd..30379eb0cf 100644 --- a/crates/worker/src/partition/mod.rs +++ b/crates/worker/src/partition/mod.rs @@ -52,6 +52,7 @@ use 
restate_types::invocation::{ InvocationQuery, InvocationTarget, InvocationTargetType, InvocationTermination, NotifySignalRequest, PurgeInvocationRequest, ResponseResult, ServiceInvocation, ServiceInvocationResponseSink, SubmitNotificationSink, TerminationFlavor, WorkflowHandlerType, + restart, }; use restate_types::logs::MatchKeyQuery; use restate_types::logs::{KeyFilter, Lsn, SequenceNumber}; @@ -782,7 +783,10 @@ where ) .await } - PartitionProcessorRpcRequestInner::PurgeInvocation { invocation_id } => { + PartitionProcessorRpcRequestInner::PurgeInvocation { + invocation_id, + invocation_epoch, + } => { self.leadership_state .handle_rpc_proposal_command( request_id, @@ -793,11 +797,15 @@ where response_sink: Some(InvocationMutationResponseSink::Ingress( IngressInvocationResponseSink { request_id }, )), + invocation_epoch, }), ) .await } - PartitionProcessorRpcRequestInner::PurgeJournal { invocation_id } => { + PartitionProcessorRpcRequestInner::PurgeJournal { + invocation_id, + invocation_epoch, + } => { self.leadership_state .handle_rpc_proposal_command( request_id, @@ -808,6 +816,30 @@ where response_sink: Some(InvocationMutationResponseSink::Ingress( IngressInvocationResponseSink { request_id }, )), + invocation_epoch, + }), + ) + .await + } + PartitionProcessorRpcRequestInner::RestartInvocation { + invocation_id, + if_running, + previous_attempt_retention, + apply_to_workflow_run, + } => { + self.leadership_state + .handle_rpc_proposal_command( + request_id, + response_tx, + invocation_id.partition_key(), + Command::RestartInvocation(restart::Request { + invocation_id, + if_running, + previous_attempt_retention, + apply_to_workflow_run, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { request_id }, + )), }), ) .await diff --git a/crates/worker/src/partition/state_machine/actions.rs b/crates/worker/src/partition/state_machine/actions.rs index bb2bec9c1a..bbb6cf6469 100644 --- 
a/crates/worker/src/partition/state_machine/actions.rs +++ b/crates/worker/src/partition/state_machine/actions.rs @@ -14,7 +14,7 @@ use restate_storage_api::timer_table::TimerKey; use restate_types::identifiers::{InvocationId, PartitionProcessorRpcRequestId}; use restate_types::invocation::client::{ CancelInvocationResponse, InvocationOutputResponse, KillInvocationResponse, - PurgeInvocationResponse, + PurgeInvocationResponse, RestartInvocationResponse, }; use restate_types::invocation::{InvocationEpoch, InvocationTarget}; use restate_types::journal::Completion; @@ -96,6 +96,10 @@ pub enum Action { request_id: PartitionProcessorRpcRequestId, response: PurgeInvocationResponse, }, + ForwardRestartInvocationResponse { + request_id: PartitionProcessorRpcRequestId, + response: RestartInvocationResponse, + }, } impl Action { diff --git a/crates/worker/src/partition/state_machine/entries/mod.rs b/crates/worker/src/partition/state_machine/entries/mod.rs index 8556fd1c33..93c839c02a 100644 --- a/crates/worker/src/partition/state_machine/entries/mod.rs +++ b/crates/worker/src/partition/state_machine/entries/mod.rs @@ -24,8 +24,8 @@ mod peek_promise_command; mod send_signal_command; mod set_state_command; mod sleep_command; +pub(crate) mod write_entry; -use crate::debug_if_leader; use crate::partition::state_machine::entries::attach_invocation_command::ApplyAttachInvocationCommand; use crate::partition::state_machine::entries::call_commands::{ ApplyCallCommand, ApplyOneWayCallCommand, @@ -44,6 +44,7 @@ use crate::partition::state_machine::entries::peek_promise_command::ApplyPeekPro use crate::partition::state_machine::entries::send_signal_command::ApplySendSignalCommand; use crate::partition::state_machine::entries::set_state_command::ApplySetStateCommand; use crate::partition::state_machine::entries::sleep_command::ApplySleepCommand; +use crate::partition::state_machine::entries::write_entry::WriteJournalEntryCommand; use 
crate::partition::state_machine::lifecycle::VerifyOrMigrateJournalTableToV2Command; use crate::partition::state_machine::{CommandHandler, Error, StateMachineApplyContext}; use restate_service_protocol_v4::entry_codec::ServiceProtocolV4Codec; @@ -334,36 +335,20 @@ where } }; - // -- Append journal entry - let journal_meta = self + let invocation_epoch = self.invocation_status.get_epoch(); + let journal_metadata = self .invocation_status .get_journal_metadata_mut() .expect("At this point there must be a journal"); - - let entry_index = journal_meta.length; - debug_if_leader!( - ctx.is_leader, - restate.journal.index = entry_index, - restate.invocation.id = %self.invocation_id, - "Write journal entry {:?} to storage", - entry.ty() - ); - - // Store journal entry - JournalTable::put_journal_entry( - ctx.storage, - self.invocation_id, - entry_index, - &entry, - &related_completion_ids, - ) - .await?; - - // Update journal length - journal_meta.length += 1; - if matches!(entry.ty(), EntryType::Command(_)) { - journal_meta.commands += 1; + WriteJournalEntryCommand { + invocation_id: self.invocation_id, + journal_metadata, + invocation_epoch, + entry, + related_completion_ids, } + .apply(ctx) + .await?; } // Update timestamps diff --git a/crates/worker/src/partition/state_machine/entries/write_entry.rs b/crates/worker/src/partition/state_machine/entries/write_entry.rs new file mode 100644 index 0000000000..60433c5258 --- /dev/null +++ b/crates/worker/src/partition/state_machine/entries/write_entry.rs @@ -0,0 +1,52 @@ +use crate::debug_if_leader; +use crate::partition::state_machine::{CommandHandler, Error, StateMachineApplyContext}; +use restate_storage_api::invocation_status_table::JournalMetadata; +use restate_storage_api::journal_table_v2::JournalTable; +use restate_types::identifiers::InvocationId; +use restate_types::invocation::InvocationEpoch; +use restate_types::journal_v2::raw::RawEntry; +use restate_types::journal_v2::{CompletionId, EntryMetadata, EntryType}; + 
+pub struct WriteJournalEntryCommand<'e> { + pub invocation_id: InvocationId, + pub invocation_epoch: InvocationEpoch, + pub journal_metadata: &'e mut JournalMetadata, + pub entry: RawEntry, + pub related_completion_ids: Vec, +} + +impl<'e, 'ctx: 'e, 's: 'ctx, S> CommandHandler<&'ctx mut StateMachineApplyContext<'s, S>> + for WriteJournalEntryCommand<'e> +where + S: JournalTable, +{ + async fn apply(self, ctx: &'ctx mut StateMachineApplyContext<'s, S>) -> Result<(), Error> { + let entry_index = self.journal_metadata.length; + debug_if_leader!( + ctx.is_leader, + restate.journal.index = entry_index, + restate.invocation.id = %self.invocation_id, + "Write journal entry {:?} to storage", + self.entry.ty() + ); + + // Store journal entry + ctx.storage + .put_journal_entry( + self.invocation_id, + self.invocation_epoch, + entry_index, + &self.entry, + &self.related_completion_ids, + ) + .await?; + + // Update journal length + self.journal_metadata.length += 1; + if matches!(self.entry.ty(), EntryType::Command(_)) { + self.journal_metadata.commands += 1; + } + + Ok(()) + } +} diff --git a/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs b/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs index b46dc83cbd..7e1e85c87e 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/migrate_journal_table.rs @@ -60,6 +60,7 @@ where journal_table_v2::JournalTable::put_journal_entry( ctx.storage, self.invocation_id, + self.metadata.current_invocation_epoch, 0, &new_entry.encode::(), &[], diff --git a/crates/worker/src/partition/state_machine/lifecycle/mod.rs b/crates/worker/src/partition/state_machine/lifecycle/mod.rs index 3590d69b87..ee6bdb4d47 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/mod.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/mod.rs @@ -8,6 +8,7 @@ // the Business Source License, use of this 
software will be governed // by the Apache License, Version 2.0. +mod archive; mod cancel; mod migrate_journal_table; mod notify_get_invocation_output_response; @@ -17,9 +18,12 @@ mod notify_sleep_completion; mod pinned_deployment; mod purge; mod purge_journal; +mod restart; mod resume; mod suspend; +#[allow(unused_imports)] +pub(super) use archive::ArchiveInvocationCommand; pub(super) use cancel::OnCancelCommand; pub(super) use migrate_journal_table::VerifyOrMigrateJournalTableToV2Command; pub(super) use notify_get_invocation_output_response::OnNotifyGetInvocationOutputResponse; @@ -29,5 +33,6 @@ pub(super) use notify_sleep_completion::OnNotifySleepCompletionCommand; pub(super) use pinned_deployment::OnPinnedDeploymentCommand; pub(super) use purge::OnPurgeCommand; pub(super) use purge_journal::OnPurgeJournalCommand; +pub(super) use restart::OnRestartInvocationCommand; pub(super) use resume::ResumeInvocationCommand; pub(super) use suspend::OnSuspendCommand; diff --git a/crates/worker/src/partition/state_machine/mod.rs b/crates/worker/src/partition/state_machine/mod.rs index 77cfe0919f..9980225283 100644 --- a/crates/worker/src/partition/state_machine/mod.rs +++ b/crates/worker/src/partition/state_machine/mod.rs @@ -61,7 +61,6 @@ use restate_types::identifiers::{ use restate_types::identifiers::{IdempotencyId, WithPartitionKey}; use restate_types::invocation::client::{ CancelInvocationResponse, InvocationOutputResponse, KillInvocationResponse, - PurgeInvocationResponse, }; use restate_types::invocation::{ AttachInvocationRequest, IngressInvocationResponseSink, InvocationEpoch, @@ -278,8 +277,27 @@ impl StateMachineApplyContext<'_, S> { if let Some(invocation_target) = status.invocation_target() { Span::current().record_invocation_target(invocation_target); } - if let Some(invocation_metadata) = status.get_invocation_metadata() { - Span::current().record_invocation_epoch(&invocation_metadata.current_invocation_epoch); + 
Span::current().record_invocation_epoch(&status.get_epoch()); + Ok(status) + } + + async fn get_invocation_status_for_epoch( + &mut self, + invocation_id: &InvocationId, + invocation_epoch: InvocationEpoch, + ) -> Result + where + S: ReadOnlyInvocationStatusTable, + { + Span::current().record_invocation_id(invocation_id); + Span::current().record_invocation_epoch(&invocation_epoch); + let status = self + .storage + .get_invocation_status_for_epoch(invocation_id, invocation_epoch) + .await?; + + if let Some(invocation_target) = status.invocation_target() { + Span::current().record_invocation_target(invocation_target); } Ok(status) } @@ -461,6 +479,7 @@ impl StateMachineApplyContext<'_, S> { lifecycle::OnPurgeCommand { invocation_id: purge_invocation_request.invocation_id, response_sink: purge_invocation_request.response_sink, + invocation_epoch: purge_invocation_request.invocation_epoch, } .apply(self) .await?; @@ -470,6 +489,20 @@ impl StateMachineApplyContext<'_, S> { lifecycle::OnPurgeJournalCommand { invocation_id: purge_invocation_request.invocation_id, response_sink: purge_invocation_request.response_sink, + invocation_epoch: purge_invocation_request.invocation_epoch, + } + .apply(self) + .await?; + Ok(()) + } + Command::RestartInvocation(restart_invocation_request) => { + lifecycle::OnRestartInvocationCommand { + invocation_id: restart_invocation_request.invocation_id, + if_running: restart_invocation_request.if_running, + previous_attempt_retention: restart_invocation_request + .previous_attempt_retention, + apply_to_workflow_run: restart_invocation_request.apply_to_workflow_run, + response_sink: restart_invocation_request.response_sink, } .apply(self) .await?; @@ -1658,6 +1691,7 @@ impl StateMachineApplyContext<'_, S> { lifecycle::OnPurgeCommand { invocation_id, response_sink: None, + invocation_epoch: 0, } .apply(self) .await?; @@ -2013,6 +2047,7 @@ impl StateMachineApplyContext<'_, S> { if journal_retention.is_zero() { self.do_drop_journal( invocation_id, 
+ None, journal_length, should_remove_journal_table_v2, ) @@ -2291,7 +2326,7 @@ impl StateMachineApplyContext<'_, S> { if let Some(service_id) = invocation_metadata.invocation_target.as_keyed_service_id() { - self.do_clear_all_state(service_id, invocation_id).await?; + self.do_clear_all_state(service_id).await?; } else { warn!( "Trying to process entry {} for a target that has no state", @@ -3392,54 +3427,6 @@ impl StateMachineApplyContext<'_, S> { }); } - fn reply_to_purge_invocation( - &mut self, - response_sink: Option, - response: PurgeInvocationResponse, - ) { - if response_sink.is_none() { - return; - } - let InvocationMutationResponseSink::Ingress(IngressInvocationResponseSink { request_id }) = - response_sink.unwrap(); - debug_if_leader!( - self.is_leader, - "Send purge response to request id '{:?}': {:?}", - request_id, - response - ); - - self.action_collector - .push(Action::ForwardPurgeInvocationResponse { - request_id, - response, - }); - } - - fn reply_to_purge_journal( - &mut self, - response_sink: Option, - response: PurgeInvocationResponse, - ) { - if response_sink.is_none() { - return; - } - let InvocationMutationResponseSink::Ingress(IngressInvocationResponseSink { request_id }) = - response_sink.unwrap(); - debug_if_leader!( - self.is_leader, - "Send purge response to request id '{:?}': {:?}", - request_id, - response - ); - - self.action_collector - .push(Action::ForwardPurgeJournalResponse { - request_id, - response, - }); - } - fn send_submit_notification_if_needed( &mut self, invocation_id: InvocationId, @@ -3776,27 +3763,12 @@ impl StateMachineApplyContext<'_, S> { .map_err(Error::Storage) } - #[tracing::instrument( - skip_all, - level="info", - name="clear_all_state", - fields( - restate.invocation.id = %invocation_id, - rpc.service = %service_id.service_name - ) - )] - async fn do_clear_all_state( - &mut self, - service_id: ServiceId, - invocation_id: InvocationId, - ) -> Result<(), Error> + async fn do_clear_all_state(&mut self, 
service_id: ServiceId) -> Result<(), Error> where S: StateTable, { - debug_if_leader!(self.is_leader, "Effect: Clear all state"); - + debug_if_leader!(self.is_leader, "Clear all state for service {service_id}"); self.storage.delete_all_user_state(&service_id).await?; - Ok(()) } @@ -3872,6 +3844,7 @@ impl StateMachineApplyContext<'_, S> { async fn do_drop_journal( &mut self, invocation_id: InvocationId, + invocation_epoch: Option, journal_length: EntryIndex, should_remove_journal_table_v2: bool, ) -> Result<(), Error> @@ -3888,6 +3861,7 @@ impl StateMachineApplyContext<'_, S> { journal_table_v2::JournalTable::delete_journal( self.storage, invocation_id, + invocation_epoch, journal_length, ) .await @@ -4196,8 +4170,7 @@ enum InvocationStatusProjection { fn should_use_journal_table_v2(status: &InvocationStatus) -> bool { status - .get_invocation_metadata() - .and_then(|im| im.pinned_deployment.as_ref()) + .get_pinned_deployment() .is_some_and(|pinned_deployment| { pinned_deployment.service_protocol_version >= ServiceProtocolVersion::V4 }) diff --git a/crates/worker/src/partition/state_machine/tests/fixtures.rs b/crates/worker/src/partition/state_machine/tests/fixtures.rs index 890efc44a1..36e63881c3 100644 --- a/crates/worker/src/partition/state_machine/tests/fixtures.rs +++ b/crates/worker/src/partition/state_machine/tests/fixtures.rs @@ -113,10 +113,18 @@ pub fn invoker_end_effect_for_epoch( pub fn pinned_deployment( invocation_id: InvocationId, service_protocol_version: ServiceProtocolVersion, +) -> Command { + pinned_deployment_for_epoch(invocation_id, 0, service_protocol_version) +} + +pub fn pinned_deployment_for_epoch( + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + service_protocol_version: ServiceProtocolVersion, ) -> Command { Command::InvokerEffect(InvokerEffect { invocation_id, - invocation_epoch: 0, + invocation_epoch, kind: InvokerEffectKind::PinnedDeployment(PinnedDeployment { deployment_id: DeploymentId::default(), 
service_protocol_version, diff --git a/crates/worker/src/partition/state_machine/tests/idempotency.rs b/crates/worker/src/partition/state_machine/tests/idempotency.rs index f017c7c75d..28c5f78139 100644 --- a/crates/worker/src/partition/state_machine/tests/idempotency.rs +++ b/crates/worker/src/partition/state_machine/tests/idempotency.rs @@ -241,8 +241,10 @@ async fn complete_already_completed_invocation() { response_result: ResponseResult::Success(response_bytes.clone()), completion_retention_duration: Default::default(), journal_retention_duration: Default::default(), + invocation_epoch: 0, journal_metadata: JournalMetadata::empty(), pinned_deployment: None, + completion_range_epoch_map: Default::default(), }), ) .await @@ -770,6 +772,7 @@ async fn purge_completed_idempotent_invocation() { .apply(Command::PurgeInvocation(PurgeInvocationRequest { invocation_id, response_sink: None, + invocation_epoch: 0, })) .await; assert_that!( diff --git a/crates/worker/src/partition/state_machine/tests/matchers.rs b/crates/worker/src/partition/state_machine/tests/matchers.rs index df9f4cf7e8..61aba1ca5a 100644 --- a/crates/worker/src/partition/state_machine/tests/matchers.rs +++ b/crates/worker/src/partition/state_machine/tests/matchers.rs @@ -28,7 +28,7 @@ pub mod storage { }; use restate_storage_api::journal_table::JournalEntry; use restate_types::identifiers::InvocationId; - use restate_types::invocation::InvocationTarget; + use restate_types::invocation::{InvocationEpoch, InvocationTarget}; use restate_types::journal::Entry; pub fn has_journal_length( @@ -55,6 +55,14 @@ pub mod storage { ) } + pub fn is_epoch(epoch: InvocationEpoch) -> impl Matcher { + property_matcher::internal::property_matcher( + |o: &InvocationStatus| o.get_epoch(), + "get_epoch()", + eq(epoch), + ) + } + pub fn invocation_inbox_entry( invocation_id: InvocationId, invocation_target: &InvocationTarget, diff --git a/crates/worker/src/partition/state_machine/tests/mod.rs 
b/crates/worker/src/partition/state_machine/tests/mod.rs index 6f1a173992..ecdb46b2dc 100644 --- a/crates/worker/src/partition/state_machine/tests/mod.rs +++ b/crates/worker/src/partition/state_machine/tests/mod.rs @@ -197,6 +197,35 @@ impl TestEnv { .collect() } + pub async fn read_archived_journal_to_vec( + &mut self, + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + journal_length: EntryIndex, + ) -> Vec { + let mut entries = Vec::with_capacity(journal_length as usize); + for i in 0..journal_length { + entries.push( + restate_storage_api::journal_table_v2::ReadOnlyJournalTable::get_journal_entry_for_epoch( + self.storage(), + invocation_id, + invocation_epoch, + i + ).await .expect("storage to be working") + .expect("entry should be here") + ) + } + + entries + .into_iter() + .enumerate() + .map(|(i, e)| { + e.decode::() + .unwrap_or_else(|_| panic!("entry index {i} can be decoded")) + }) + .collect() + } + pub async fn modify_invocation_status( &mut self, invocation_id: InvocationId, @@ -237,10 +266,58 @@ impl TestEnv { assert_eq!(actual_entry_types, expected_entry_types); // Verify we don't go out of bounds + assert_that!( + restate_storage_api::journal_table_v2::ReadOnlyJournalTable::get_journal_entry( + self.storage(), + invocation_id, + expected_entry_types.len() as u32 + ) + .await, + ok(none()) + ); + } + + pub async fn verify_journal_components_for_epoch( + &mut self, + invocation_id: InvocationId, + invocation_epoch: InvocationEpoch, + entry_types: impl IntoIterator, + ) { + let expected_entry_types = entry_types.into_iter().collect::>(); + let expected_commands = expected_entry_types + .iter() + .filter(|e| e.is_command()) + .count(); assert_that!( self.storage - .get_journal_entry(&invocation_id, expected_entry_types.len() as u32) + .get_invocation_status_for_epoch(&invocation_id, invocation_epoch) .await, + ok(all!( + matchers::storage::has_journal_length(expected_entry_types.len() as u32), + 
matchers::storage::has_commands(expected_commands as u32), + matchers::storage::is_epoch(invocation_epoch) + )) + ); + let actual_entry_types = self + .read_archived_journal_to_vec( + invocation_id, + invocation_epoch, + expected_entry_types.len() as EntryIndex, + ) + .await + .into_iter() + .map(|e| e.ty()) + .collect::>(); + assert_eq!(actual_entry_types, expected_entry_types); + + // Verify we don't go out of bounds + assert_that!( + restate_storage_api::journal_table_v2::ReadOnlyJournalTable::get_journal_entry_for_epoch( + self.storage(), + invocation_id, + invocation_epoch, + expected_entry_types.len() as u32 + ).await, ok(none()) ); } diff --git a/crates/worker/src/partition/state_machine/tests/workflow.rs b/crates/worker/src/partition/state_machine/tests/workflow.rs index cddfe9296e..302c934cdf 100644 --- a/crates/worker/src/partition/state_machine/tests/workflow.rs +++ b/crates/worker/src/partition/state_machine/tests/workflow.rs @@ -14,6 +14,7 @@ use crate::partition::state_machine::tests::matchers::actions::forward_purge_inv use restate_storage_api::invocation_status_table::CompletedInvocation; use restate_storage_api::service_status_table::ReadOnlyVirtualObjectStatusTable; use restate_types::errors::WORKFLOW_ALREADY_INVOKED_INVOCATION_ERROR; +use restate_types::invocation::client::PurgeInvocationResponse; use restate_types::invocation::{ AttachInvocationRequest, IngressInvocationResponseSink, InvocationQuery, InvocationTarget, PurgeInvocationRequest, @@ -344,6 +345,7 @@ async fn purge_completed_workflow() { response_sink: Some(InvocationMutationResponseSink::Ingress( IngressInvocationResponseSink { request_id }, )), + invocation_epoch: 0, })) .await; assert_that!( diff --git a/tools/xtask/src/main.rs b/tools/xtask/src/main.rs index 715a4f2972..438b1a3e66 100644 --- a/tools/xtask/src/main.rs +++ b/tools/xtask/src/main.rs @@ -28,10 +28,11 @@ use restate_types::identifiers::{InvocationId, PartitionProcessorRpcRequestId, S use 
restate_types::invocation::client::{ AttachInvocationResponse, CancelInvocationResponse, GetInvocationOutputResponse, InvocationClient, InvocationClientError, InvocationOutput, KillInvocationResponse, - PurgeInvocationResponse, SubmittedInvocationNotification, + PurgeInvocationResponse, RestartInvocationResponse, SubmittedInvocationNotification, }; +use restate_types::invocation::restart::{ApplyToWorkflowRun, IfRunning}; use restate_types::invocation::{ - InvocationQuery, InvocationRequest, InvocationResponse, InvocationTermination, + InvocationEpoch, InvocationQuery, InvocationRequest, InvocationResponse, InvocationTermination, }; use restate_types::journal_v2::Signal; use restate_types::live::Constant; @@ -174,6 +175,7 @@ impl InvocationClient for Mock { &self, _: PartitionProcessorRpcRequestId, _: InvocationId, + _: InvocationEpoch, ) -> impl Future> + Send { pending() } @@ -182,9 +184,21 @@ impl InvocationClient for Mock { &self, _: PartitionProcessorRpcRequestId, _: InvocationId, + _: InvocationEpoch, ) -> impl Future> + Send { pending() } + + fn restart_invocation( + &self, + _: PartitionProcessorRpcRequestId, + _: InvocationId, + _: IfRunning, + _: Option, + _: ApplyToWorkflowRun, + ) -> impl Future> + Send { + pending() + } } async fn generate_rest_api_doc() -> anyhow::Result<()> { From 99256387cc9dd0cc7d0f4603b49fdeceea87f1ff Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 14:02:49 +0200 Subject: [PATCH 11/14] Return the new invocation epoch in restart_invocation API --- crates/admin/src/rest_api/invocations.rs | 42 +++++++++++-------- crates/types/src/invocation/client.rs | 2 +- crates/types/src/net/partition_processor.rs | 10 +++-- .../state_machine/lifecycle/restart.rs | 10 ++--- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/crates/admin/src/rest_api/invocations.rs b/crates/admin/src/rest_api/invocations.rs index c03a3836d7..3321c92ae5 100644 --- a/crates/admin/src/rest_api/invocations.rs +++ 
b/crates/admin/src/rest_api/invocations.rs @@ -12,19 +12,20 @@ use super::error::*; use crate::generate_meta_api_error; use crate::rest_api::create_envelope_header; use crate::state::AdminServiceState; +use axum::Json; use axum::extract::{Path, Query, State}; use axum::http::StatusCode; use okapi_operation::*; use restate_types::identifiers::{InvocationId, PartitionProcessorRpcRequestId, WithPartitionKey}; use restate_types::invocation::client::{ - CancelInvocationResponse, InvocationClient, KillInvocationResponse, PurgeInvocationResponse, - RestartInvocationResponse, + self, CancelInvocationResponse, InvocationClient, KillInvocationResponse, + PurgeInvocationResponse, }; use restate_types::invocation::{ InvocationEpoch, InvocationTermination, PurgeInvocationRequest, TerminationFlavor, restart, }; use restate_wal_protocol::{Command, Envelope}; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::sync::Arc; use std::time::Duration; use tracing::warn; @@ -347,6 +348,12 @@ where Ok(()) } +#[derive(Debug, Serialize, JsonSchema)] +pub struct RestartInvocationResponse { + /// The new invocation epoch of the invocation. + pub new_invocation_epoch: InvocationEpoch, +} + /// What to do if the invocation is still running. By default, the running invocation will be killed. #[derive(Default, Debug, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] @@ -466,7 +473,7 @@ pub async fn restart_invocation( previous_attempt_retention, apply_to_workflow_run, }): Query, -) -> Result<(), RestartInvocationError> +) -> Result, RestartInvocationError> where IC: InvocationClient, { @@ -486,23 +493,24 @@ where .await .map_err(InvocationClientError)? 
{ - RestartInvocationResponse::Ok => {} - RestartInvocationResponse::NotFound => { + client::RestartInvocationResponse::Ok { new_epoch } => Ok(RestartInvocationResponse { + new_invocation_epoch: new_epoch, + } + .into()), + client::RestartInvocationResponse::NotFound => { Err(InvocationNotFoundError(invocation_id.to_string()))? } - RestartInvocationResponse::StillRunning => Err(RestartInvocationStillRunningError( - invocation_id.to_string(), - ))?, - RestartInvocationResponse::Unsupported => { + client::RestartInvocationResponse::StillRunning => Err( + RestartInvocationStillRunningError(invocation_id.to_string()), + )?, + client::RestartInvocationResponse::Unsupported => { Err(RestartInvocationUnsupportedError(invocation_id.to_string()))? } - RestartInvocationResponse::MissingInput => Err(RestartInvocationMissingInputError( - invocation_id.to_string(), - ))?, - RestartInvocationResponse::NotStarted => { + client::RestartInvocationResponse::MissingInput => Err( + RestartInvocationMissingInputError(invocation_id.to_string()), + )?, + client::RestartInvocationResponse::NotStarted => { Err(RestartInvocationNotStartedError(invocation_id.to_string()))? 
} - }; - - Ok(()) + } } diff --git a/crates/types/src/invocation/client.rs b/crates/types/src/invocation/client.rs index 29ea90e5ab..78792913a8 100644 --- a/crates/types/src/invocation/client.rs +++ b/crates/types/src/invocation/client.rs @@ -111,7 +111,7 @@ pub enum PurgeInvocationResponse { #[derive(Debug, Clone, PartialEq, Eq)] pub enum RestartInvocationResponse { - Ok, + Ok { new_epoch: InvocationEpoch }, NotFound, StillRunning, Unsupported, diff --git a/crates/types/src/net/partition_processor.rs b/crates/types/src/net/partition_processor.rs index 6a52a205b5..c682de0d08 100644 --- a/crates/types/src/net/partition_processor.rs +++ b/crates/types/src/net/partition_processor.rs @@ -230,7 +230,7 @@ impl From for PurgeInvocationRpcResponse { #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum RestartInvocationRpcResponse { - Ok, + Ok { new_epoch: InvocationEpoch }, NotFound, StillRunning, Unsupported, @@ -241,7 +241,9 @@ pub enum RestartInvocationRpcResponse { impl From for RestartInvocationResponse { fn from(value: RestartInvocationRpcResponse) -> Self { match value { - RestartInvocationRpcResponse::Ok => RestartInvocationResponse::Ok, + RestartInvocationRpcResponse::Ok { new_epoch } => { + RestartInvocationResponse::Ok { new_epoch } + } RestartInvocationRpcResponse::NotFound => RestartInvocationResponse::NotFound, RestartInvocationRpcResponse::StillRunning => RestartInvocationResponse::StillRunning, RestartInvocationRpcResponse::Unsupported => RestartInvocationResponse::Unsupported, @@ -254,7 +256,9 @@ impl From for RestartInvocationResponse { impl From for RestartInvocationRpcResponse { fn from(value: RestartInvocationResponse) -> Self { match value { - RestartInvocationResponse::Ok => RestartInvocationRpcResponse::Ok, + RestartInvocationResponse::Ok { new_epoch } => { + RestartInvocationRpcResponse::Ok { new_epoch } + } RestartInvocationResponse::NotFound => RestartInvocationRpcResponse::NotFound, RestartInvocationResponse::StillRunning => 
RestartInvocationRpcResponse::StillRunning, RestartInvocationResponse::Unsupported => RestartInvocationRpcResponse::Unsupported, diff --git a/crates/worker/src/partition/state_machine/lifecycle/restart.rs b/crates/worker/src/partition/state_machine/lifecycle/restart.rs index c58b0505ff..063b37fd2e 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/restart.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/restart.rs @@ -315,7 +315,7 @@ where .await?; // Reply to the listener, restart went well - ctx.reply_to_restart_invocation(response_sink, RestartInvocationResponse::Ok); + ctx.reply_to_restart_invocation(response_sink, RestartInvocationResponse::Ok { new_epoch }); Ok(()) } @@ -478,7 +478,7 @@ mod tests { all!( contains(pat!(Action::ForwardRestartInvocationResponse { request_id: eq(restart_request_id), - response: eq(RestartInvocationResponse::Ok) + response: eq(RestartInvocationResponse::Ok { new_epoch: 1 }) })), contains(pat!(Action::Invoke { invocation_id: eq(invocation_id), @@ -603,7 +603,7 @@ mod tests { // Verify the restart response is sent contains(pat!(Action::ForwardRestartInvocationResponse { request_id: eq(restart_request_id), - response: eq(RestartInvocationResponse::Ok) + response: eq(RestartInvocationResponse::Ok { new_epoch: 1 }) })), // Verify the invocation is restarted contains(pat!(Action::Invoke { @@ -787,7 +787,7 @@ mod tests { actions, contains(pat!(Action::ForwardRestartInvocationResponse { request_id: eq(restart_request_id), - response: eq(RestartInvocationResponse::Ok) + response: eq(RestartInvocationResponse::Ok { new_epoch: 1 }) })) ); @@ -927,7 +927,7 @@ mod tests { // Verify the restart response is sent contains(pat!(Action::ForwardRestartInvocationResponse { request_id: eq(restart_request_id), - response: eq(RestartInvocationResponse::Ok) + response: eq(RestartInvocationResponse::Ok { new_epoch: 1 }) })), // Verify the invocation is restarted contains(pat!(Action::Invoke { From 
12b8c3aa5fbc6b45243e6b795327b5a222e176a0 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 5 Jun 2025 15:32:25 +0200 Subject: [PATCH 12/14] More thorough test --- .../state_machine/lifecycle/restart.rs | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/crates/worker/src/partition/state_machine/lifecycle/restart.rs b/crates/worker/src/partition/state_machine/lifecycle/restart.rs index 063b37fd2e..6911e654ef 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/restart.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/restart.rs @@ -374,7 +374,8 @@ mod tests { use crate::partition::state_machine::Action; use crate::partition::state_machine::tests::TestEnv; use crate::partition::state_machine::tests::fixtures::{ - invoker_end_effect, invoker_entry_effect, pinned_deployment, + invoker_end_effect, invoker_end_effect_for_epoch, invoker_entry_effect, + invoker_entry_effect_for_epoch, pinned_deployment, pinned_deployment_for_epoch, }; use crate::partition::state_machine::tests::matchers::storage::{ has_commands, has_journal_length, is_epoch, is_variant, @@ -960,6 +961,31 @@ mod tests { ok(eq(InvocationStatus::Free)) ); + // Complete the restarted invocation too + let _ = test_env + .apply_multiple([ + pinned_deployment_for_epoch(invocation_id, 1, ServiceProtocolVersion::V5), + invoker_entry_effect_for_epoch( + invocation_id, + 1, + OutputCommand { + result: OutputResult::Success(Bytes::from_static(b"456")), + name: Default::default(), + }, + ), + invoker_end_effect_for_epoch(invocation_id, 1), + ]) + .await; + + // Nothing left for this invocation + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await, + ok(eq(InvocationStatus::Free)) + ); + test_env.shutdown().await; } From 3695fb2204f6d8349fff4f17f6f639e89246e0f6 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 6 Jun 2025 14:39:14 +0200 Subject: [PATCH 13/14] Rebase change --- crates/types/src/invocation/mod.rs | 
3 ++- crates/worker/src/partition/state_machine/lifecycle/restart.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/types/src/invocation/mod.rs b/crates/types/src/invocation/mod.rs index 87f3695870..0dc27503d7 100644 --- a/crates/types/src/invocation/mod.rs +++ b/crates/types/src/invocation/mod.rs @@ -978,7 +978,6 @@ pub struct PurgeInvocationRequest { pub response_sink: Option, /// When epoch is the current/latest epoch, all the other epochs will be cleaned up as well. #[serde(default, skip_serializing_if = "num_traits::Zero::is_zero")] - #[bilrost(6)] pub invocation_epoch: InvocationEpoch, } @@ -1165,12 +1164,14 @@ pub mod restart { /// /// To retain the previous attempt, the new attempt will take the invocation id of the previous attempt, the one used to trigger this reset, /// and old invocation id will take a new randomly generated invocation id. + #[serde(default, skip_serializing_if = "Option::is_none")] pub previous_attempt_retention: Option, /// What to do in case the invocation was a Workflow run (workflow service and workflow handler type) pub apply_to_workflow_run: ApplyToWorkflowRun, /// Where to send the response for this command + #[serde(default, skip_serializing_if = "Option::is_none")] pub response_sink: Option, } diff --git a/crates/worker/src/partition/state_machine/lifecycle/restart.rs b/crates/worker/src/partition/state_machine/lifecycle/restart.rs index 6911e654ef..9e73e0ec76 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/restart.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/restart.rs @@ -297,11 +297,12 @@ where completion_range_epoch_map: invocation.completion_range_epoch_map, journal_metadata: invocation.journal_metadata, hotfix_apply_cancellation_after_deployment_is_pinned: false, + created_using_restate_version: invocation.created_using_restate_version, // Reset the pinned deployment pinned_deployment: None, // Reset timestamps - timestamps: StatusTimestamps::now(), + 
timestamps: StatusTimestamps::init(), // Reset response sinks response_sinks: Default::default(), }; From a1f82118ce171b7c7bdb31ef34e8a5dda2025889 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 13 Mar 2025 18:55:14 +0100 Subject: [PATCH 14/14] Implement Reset invocation feature --- crates/admin/src/rest_api/error.rs | 14 + crates/admin/src/rest_api/invocations.rs | 172 +++- crates/admin/src/rest_api/mod.rs | 4 + .../partition_processor_rpc_client.rs | 38 +- crates/invoker-impl/src/input_command.rs | 2 +- crates/partition-store/src/protobuf_types.rs | 18 +- .../tests/invocation_status_table_test/mod.rs | 4 +- .../src/invocation_status_table/mod.rs | 94 +- crates/types/src/errors.rs | 4 + crates/types/src/invocation/client.rs | 21 + crates/types/src/invocation/mod.rs | 68 ++ crates/types/src/journal_v2/command.rs | 2 +- crates/types/src/journal_v2/notification.rs | 10 +- crates/types/src/net/partition_processor.rs | 51 +- crates/wal-protocol/src/lib.rs | 17 +- .../src/partition/leadership/leader_state.rs | 10 + crates/worker/src/partition/mod.rs | 27 +- .../src/partition/state_machine/actions.rs | 6 +- .../state_machine/invocation_status_ext.rs | 10 +- .../partition/state_machine/lifecycle/mod.rs | 2 + .../state_machine/lifecycle/reset.rs | 878 ++++++++++++++++++ .../state_machine/lifecycle/restart.rs | 12 +- .../worker/src/partition/state_machine/mod.rs | 60 +- .../tests/invocation_epoch_awareness.rs | 4 +- .../partition/state_machine/tests/matchers.rs | 58 +- tools/xtask/src/main.rs | 18 +- 26 files changed, 1470 insertions(+), 134 deletions(-) create mode 100644 crates/worker/src/partition/state_machine/lifecycle/reset.rs diff --git a/crates/admin/src/rest_api/error.rs b/crates/admin/src/rest_api/error.rs index d26ab814c3..049e5e807f 100644 --- a/crates/admin/src/rest_api/error.rs +++ b/crates/admin/src/rest_api/error.rs @@ -195,6 +195,20 @@ impl_meta_api_error!(RestartInvocationMissingInputError: GONE "The invocation ca pub(crate) struct 
RestartInvocationNotStartedError(pub(crate) String); impl_meta_api_error!(RestartInvocationNotStartedError: TOO_EARLY "The invocation cannot be restarted because it's not running yet, meaning it might have been scheduled or inboxed."); +#[derive(Debug, thiserror::Error)] +#[error( + "Resetting the invocation '{0}' is not supported, because it was started using the old service protocol." +)] +pub(crate) struct ResetInvocationUnsupportedError(pub(crate) String); +impl_meta_api_error!(ResetInvocationUnsupportedError: UNPROCESSABLE_ENTITY "Resetting the invocation is not supported, because it was started using the old service protocol."); + +#[derive(Debug, thiserror::Error)] +#[error( + "The invocation '{0}' cannot be reset because it's not running. For completed invocations, use restart instead." +)] +pub(crate) struct ResetInvocationNotRunningError(pub(crate) String); +impl_meta_api_error!(ResetInvocationNotRunningError: TOO_EARLY "The invocation cannot be reset because it's not running. For completed invocations, use restart instead."); + // --- Old Meta API errors. Please don't use these anymore. 
/// This error is used by handlers to propagate API errors, diff --git a/crates/admin/src/rest_api/invocations.rs b/crates/admin/src/rest_api/invocations.rs index 3321c92ae5..d6e5394142 100644 --- a/crates/admin/src/rest_api/invocations.rs +++ b/crates/admin/src/rest_api/invocations.rs @@ -21,8 +21,10 @@ use restate_types::invocation::client::{ self, CancelInvocationResponse, InvocationClient, KillInvocationResponse, PurgeInvocationResponse, }; +use restate_types::invocation::reset::TruncateFrom; use restate_types::invocation::{ - InvocationEpoch, InvocationTermination, PurgeInvocationRequest, TerminationFlavor, restart, + InvocationEpoch, InvocationTermination, PurgeInvocationRequest, TerminationFlavor, reset, + restart, }; use restate_wal_protocol::{Command, Envelope}; use serde::{Deserialize, Serialize}; @@ -514,3 +516,171 @@ where } } } + +#[derive(Default, Debug, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum ResetInvocationApplyToChildInvocations { + Nothing, + /// Kill all the child invocations that have been created after the truncation point + #[default] + Kill, + /// Cancel all the child invocations that have been created after the truncation point + Cancel, +} + +impl From for reset::ApplyToChildInvocations { + fn from(value: ResetInvocationApplyToChildInvocations) -> Self { + match value { + ResetInvocationApplyToChildInvocations::Kill => reset::ApplyToChildInvocations::Kill, + ResetInvocationApplyToChildInvocations::Nothing => { + reset::ApplyToChildInvocations::Nothing + } + ResetInvocationApplyToChildInvocations::Cancel => { + reset::ApplyToChildInvocations::Cancel + } + } + } +} + +#[derive(Default, Debug, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum ResetInvocationApplyToPinnedDeployment { + #[default] + Keep, + /// Clear the pinned deployment. + /// + /// NOTE: If the new picked up deployment doesn't support the current service protocol version, the invocation will remain stuck in a retry loop. 
Use with caution! + Clear, +} + +impl From for reset::ApplyToPinnedDeployment { + fn from(value: ResetInvocationApplyToPinnedDeployment) -> Self { + match value { + ResetInvocationApplyToPinnedDeployment::Keep => reset::ApplyToPinnedDeployment::Keep, + ResetInvocationApplyToPinnedDeployment::Clear => reset::ApplyToPinnedDeployment::Clear, + } + } +} + +#[derive(Debug, Default, Deserialize, JsonSchema)] +pub struct ResetInvocationParams { + pub truncate_from: Option, + #[serde( + default, + with = "serde_with::As::>" + )] + #[schemars(with = "Option")] + pub previous_attempt_retention: Option, + pub apply_to_child_calls: Option, + pub apply_to_pinned_deployment: Option, +} + +#[derive(Debug, Serialize, JsonSchema)] +pub struct ResetInvocationResponse { + /// The new invocation epoch of the invocation. + pub new_invocation_epoch: InvocationEpoch, +} + +generate_meta_api_error!(ResetInvocationError: [ + InvocationNotFoundError, + InvocationClientError, + InvalidFieldError, + ResetInvocationUnsupportedError, + ResetInvocationNotRunningError +]); + +/// Reset an invocation +#[openapi( + summary = "Reset an invocation", + description = "Reset the given invocation, truncating the progress from the given journal entry index onward and resuming afterward.", + operation_id = "reset_invocation", + tags = "invocation", + parameters( + path( + name = "invocation_id", + description = "Invocation identifier.", + schema = "std::string::String" + ), + query( + name = "truncate_from", + description = "Journal entry index to truncate from, inclusive. The index MUST correspond to a command entry or to a signal notification, and it cannot be zero, otherwise this operation will fail. 
If not provided, it defaults to 1 (after the first entry).", + required = false, + style = "simple", + allow_empty_value = false, + schema = "u32", + ), + query( + name = "previous_attempt_retention", + description = "If set, it will override the configured completion_retention/journal_retention when the invocation was executed the first time. If none of the completion_retention/journal_retention are configured, and neither this previous_attempt_retention, then the previous attempt won't be retained at all. Can be configured using humantime format or ISO8601.", + required = false, + style = "simple", + allow_empty_value = false, + schema = String, + ), + query( + name = "apply_to_child_calls", + description = "What to do with children calls that have been created after the truncation point. By default, kills all the children calls. This doesn't apply to sends.", + required = false, + style = "simple", + allow_empty_value = false, + schema = ResetInvocationApplyToChildInvocations, + ), + query( + name = "apply_to_pinned_deployment", + description = "What to do with pinned deployment. 
By default, the current pinned deployment will be kept.", + required = false, + style = "simple", + allow_empty_value = false, + schema = ResetInvocationApplyToPinnedDeployment, + ) + ) +)] +pub async fn reset_invocation( + State(state): State>, + Path(invocation_id): Path, + Query(ResetInvocationParams { + truncate_from, + previous_attempt_retention, + apply_to_child_calls, + apply_to_pinned_deployment, + }): Query, +) -> Result, ResetInvocationError> +where + IC: InvocationClient, +{ + let invocation_id = invocation_id + .parse::() + .map_err(|e| InvalidFieldError("invocation_id", e.to_string()))?; + + match state + .invocation_client + .reset_invocation( + PartitionProcessorRpcRequestId::new(), + invocation_id, + TruncateFrom::EntryIndex { entry_index: truncate_from.unwrap_or(1) }, + previous_attempt_retention, + apply_to_child_calls.unwrap_or_default().into(), + apply_to_pinned_deployment.unwrap_or_default().into(), + ) + .await + .map_err(InvocationClientError)? + { + client::ResetInvocationResponse::Ok { new_epoch } => Ok(ResetInvocationResponse { + new_invocation_epoch: new_epoch, + } + .into()), + + client::ResetInvocationResponse::NotFound => { + Err(InvocationNotFoundError(invocation_id.to_string()))? + } + client::ResetInvocationResponse::Unsupported => { + Err(ResetInvocationUnsupportedError(invocation_id.to_string()))? + } + client::ResetInvocationResponse::NotRunning => { + Err( ResetInvocationNotRunningError(invocation_id.to_string()))? + } + client::ResetInvocationResponse::BadIndex => { + Err(InvalidFieldError("truncate_from", "The index MUST correspond to a command entry or to a signal notification, and it cannot be zero.".to_owned()))? 
+ } + } +} diff --git a/crates/admin/src/rest_api/mod.rs b/crates/admin/src/rest_api/mod.rs index 6faedeffbb..56ad869e0a 100644 --- a/crates/admin/src/rest_api/mod.rs +++ b/crates/admin/src/rest_api/mod.rs @@ -106,6 +106,10 @@ where "/invocations/:invocation_id/restart", patch(openapi_handler!(invocations::restart_invocation)), ) + .route( + "/invocations/:invocation_id/reset", + patch(openapi_handler!(invocations::reset_invocation)), + ) .route( "/subscriptions", post(openapi_handler!(subscriptions::create_subscription)), diff --git a/crates/core/src/worker_api/partition_processor_rpc_client.rs b/crates/core/src/worker_api/partition_processor_rpc_client.rs index 15b9be39c7..ccc1d5e3da 100644 --- a/crates/core/src/worker_api/partition_processor_rpc_client.rs +++ b/crates/core/src/worker_api/partition_processor_rpc_client.rs @@ -21,7 +21,11 @@ use restate_types::identifiers::{ use restate_types::invocation::client::{ AttachInvocationResponse, CancelInvocationResponse, GetInvocationOutputResponse, InvocationClient, InvocationClientError, InvocationOutput, KillInvocationResponse, - PurgeInvocationResponse, RestartInvocationResponse, SubmittedInvocationNotification, + PurgeInvocationResponse, ResetInvocationResponse, RestartInvocationResponse, + SubmittedInvocationNotification, +}; +use restate_types::invocation::reset::{ + ApplyToChildInvocations, ApplyToPinnedDeployment, TruncateFrom, }; use restate_types::invocation::restart::{ApplyToWorkflowRun, IfRunning}; use restate_types::invocation::{ @@ -531,4 +535,36 @@ where } }) } + + async fn reset_invocation( + &self, + request_id: PartitionProcessorRpcRequestId, + invocation_id: InvocationId, + truncate_from: TruncateFrom, + previous_attempt_retention: Option, + apply_to_child_invocations: ApplyToChildInvocations, + apply_to_pinned_deployment: ApplyToPinnedDeployment, + ) -> Result { + let response = self + .resolve_partition_id_and_send( + request_id, + PartitionProcessorRpcRequestInner::ResetInvocation { + 
invocation_id, + previous_attempt_retention, + apply_to_child_calls: apply_to_child_invocations, + apply_to_pinned_deployment, + truncate_from, + }, + ) + .await?; + + Ok(match response { + PartitionProcessorRpcResponse::ResetInvocation(reset_invocation_response) => { + reset_invocation_response.into() + } + _ => { + panic!("Expecting ResetInvocation rpc response") + } + }) + } } diff --git a/crates/invoker-impl/src/input_command.rs b/crates/invoker-impl/src/input_command.rs index 6f622474ca..b140bedeff 100644 --- a/crates/invoker-impl/src/input_command.rs +++ b/crates/invoker-impl/src/input_command.rs @@ -33,7 +33,7 @@ pub(crate) struct InvokeCommand { pub(crate) enum InputCommand { Invoke(InvokeCommand), // TODO remove this when we remove journal v1 - // Journal V1 doesn't support epochs nor trim and restart + // Journal V1 doesn't support epochs nor reset invocation Completion { partition: PartitionLeaderEpoch, invocation_id: InvocationId, diff --git a/crates/partition-store/src/protobuf_types.rs b/crates/partition-store/src/protobuf_types.rs index dba9444bba..a1a5774b2e 100644 --- a/crates/partition-store/src/protobuf_types.rs +++ b/crates/partition-store/src/protobuf_types.rs @@ -430,7 +430,7 @@ pub mod v1 { deployment_id, service_protocol_version, current_invocation_epoch, - trim_points, + trim_points: truncation_points, waiting_for_completions, waiting_for_signal_indexes, waiting_for_signal_names, @@ -550,8 +550,8 @@ pub mod v1 { idempotency_key: idempotency_key.map(ByteString::from), hotfix_apply_cancellation_after_deployment_is_pinned, current_invocation_epoch, - completion_range_epoch_map: CompletionRangeEpochMap::from_trim_points( - trim_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) + completion_range_epoch_map: CompletionRangeEpochMap::from_truncation_points( + truncation_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) ), }, )) @@ -584,8 +584,8 @@ pub mod v1 { 
idempotency_key: idempotency_key.map(ByteString::from), hotfix_apply_cancellation_after_deployment_is_pinned, current_invocation_epoch, - completion_range_epoch_map: CompletionRangeEpochMap::from_trim_points( - trim_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) + completion_range_epoch_map: CompletionRangeEpochMap::from_truncation_points( + truncation_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) ), }, waiting_for_notifications: waiting_for_completions @@ -633,8 +633,8 @@ pub mod v1 { deployment_id, service_protocol_version, )?, - completion_range_epoch_map: CompletionRangeEpochMap::from_trim_points( - trim_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) + completion_range_epoch_map: CompletionRangeEpochMap::from_truncation_points( + truncation_points.into_iter().map(|trim_point|(trim_point.completion_id, trim_point.invocation_epoch)) ), }, )) @@ -842,7 +842,7 @@ pub mod v1 { result: None, hotfix_apply_cancellation_after_deployment_is_pinned, current_invocation_epoch, - trim_points: completion_range_epoch_map.into_trim_points_iter().into_iter().map(|(completion_id, invocation_epoch)| JournalTrimPoint { + trim_points: completion_range_epoch_map.into_truncation_points_iter().into_iter().map(|(completion_id, invocation_epoch)| JournalTrimPoint { completion_id, invocation_epoch, }).collect(), @@ -938,7 +938,7 @@ pub mod v1 { result: None, hotfix_apply_cancellation_after_deployment_is_pinned, current_invocation_epoch, - trim_points: completion_range_epoch_map.into_trim_points_iter().into_iter().map(|(completion_id, invocation_epoch)| JournalTrimPoint { + trim_points: completion_range_epoch_map.into_truncation_points_iter().into_iter().map(|(completion_id, invocation_epoch)| JournalTrimPoint { completion_id, invocation_epoch, }).collect(), diff --git a/crates/partition-store/src/tests/invocation_status_table_test/mod.rs 
b/crates/partition-store/src/tests/invocation_status_table_test/mod.rs index 00f24ce561..747823b462 100644 --- a/crates/partition-store/src/tests/invocation_status_table_test/mod.rs +++ b/crates/partition-store/src/tests/invocation_status_table_test/mod.rs @@ -99,7 +99,7 @@ fn invoked_status(invocation_target: InvocationTarget) -> InvocationStatus { idempotency_key: None, hotfix_apply_cancellation_after_deployment_is_pinned: false, current_invocation_epoch: 1, - completion_range_epoch_map: CompletionRangeEpochMap::from_trim_points([(5, 1)]), + completion_range_epoch_map: CompletionRangeEpochMap::from_truncation_points([(5, 1)]), }) } @@ -126,7 +126,7 @@ fn suspended_status(invocation_target: InvocationTarget) -> InvocationStatus { idempotency_key: None, hotfix_apply_cancellation_after_deployment_is_pinned: false, current_invocation_epoch: 1, - completion_range_epoch_map: CompletionRangeEpochMap::from_trim_points([(5, 1)]), + completion_range_epoch_map: CompletionRangeEpochMap::from_truncation_points([(5, 1)]), }, waiting_for_notifications: HashSet::default(), } diff --git a/crates/storage-api/src/invocation_status_table/mod.rs b/crates/storage-api/src/invocation_status_table/mod.rs index f4752f0c6e..883dd0d20f 100644 --- a/crates/storage-api/src/invocation_status_table/mod.rs +++ b/crates/storage-api/src/invocation_status_table/mod.rs @@ -196,6 +196,18 @@ pub enum InvocationStatus { } impl InvocationStatus { + #[inline] + pub fn discriminant(&self) -> Option { + match self { + InvocationStatus::Scheduled(_) => Some(InvocationStatusDiscriminants::Scheduled), + InvocationStatus::Inboxed(_) => Some(InvocationStatusDiscriminants::Inboxed), + InvocationStatus::Invoked(_) => Some(InvocationStatusDiscriminants::Invoked), + InvocationStatus::Suspended { .. 
} => Some(InvocationStatusDiscriminants::Suspended), + InvocationStatus::Completed(_) => Some(InvocationStatusDiscriminants::Completed), + InvocationStatus::Free => None, + } + } + #[inline] pub fn invocation_target(&self) -> Option<&InvocationTarget> { match self { @@ -400,18 +412,6 @@ impl InvocationStatus { _ => None, } } - - #[inline] - pub fn discriminant(&self) -> Option { - match self { - InvocationStatus::Scheduled(_) => Some(InvocationStatusDiscriminants::Scheduled), - InvocationStatus::Inboxed(_) => Some(InvocationStatusDiscriminants::Inboxed), - InvocationStatus::Invoked(_) => Some(InvocationStatusDiscriminants::Invoked), - InvocationStatus::Suspended { .. } => Some(InvocationStatusDiscriminants::Suspended), - InvocationStatus::Completed(_) => Some(InvocationStatusDiscriminants::Completed), - InvocationStatus::Free => None, - } - } } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -551,7 +551,7 @@ impl InboxedInvocation { } } -/// This map is used to record trim points and determine whether a completion from an old epoch should be accepted or rejected. +/// This map is used to record truncation points and determine whether a completion from an old epoch should be accepted or rejected. /// /// For more details, see the unit tests below and InvocationStatusExt in the restate-worker module. #[derive(Debug, Clone, PartialEq, Eq)] @@ -567,8 +567,8 @@ impl Default for CompletionRangeEpochMap { } impl CompletionRangeEpochMap { - /// This must use the vec returned by [Self::into_trim_points_iter]. - pub fn from_trim_points( + /// This must use the vec returned by [Self::into_truncation_points_iter]. 
+ pub fn from_truncation_points( serialized_completion_range_epoch_map: impl IntoIterator, ) -> Self { let mut this = Self::default(); @@ -576,14 +576,16 @@ impl CompletionRangeEpochMap { for (first_inclusive_completion_id_of_new_epoch, new_epoch) in serialized_completion_range_epoch_map { - this.add_trim_point(first_inclusive_completion_id_of_new_epoch, new_epoch); + this.add_truncation_point(first_inclusive_completion_id_of_new_epoch, new_epoch); } this } /// Returns a serializable representation of the map - pub fn into_trim_points_iter(self) -> impl Iterator { + pub fn into_truncation_points_iter( + self, + ) -> impl Iterator { debug_assert!( !self.0.is_empty(), "CompletionRangeEpochMap constraint not respected, it must contain at least one range 0..=MAX" @@ -597,11 +599,15 @@ impl CompletionRangeEpochMap { .map(|(range, epoch)| (*range.start(), epoch)) } - pub fn add_trim_point( + pub fn add_truncation_point( &mut self, first_inclusive_completion_id_of_new_epoch: CompletionId, new_epoch: InvocationEpoch, ) { + if first_inclusive_completion_id_of_new_epoch == CompletionId::MAX { + // Nothing to do here + return; + } self.0.insert( first_inclusive_completion_id_of_new_epoch..=CompletionId::MAX, new_epoch, @@ -725,16 +731,9 @@ pub struct CompletedInvocation { pub completion_range_epoch_map: CompletionRangeEpochMap, } -#[derive(PartialEq, Eq)] -pub enum JournalRetentionPolicy { - Retain, - Drop, -} - impl CompletedInvocation { pub fn from_in_flight_invocation_metadata( mut in_flight_invocation_metadata: InFlightInvocationMetadata, - journal_retention_policy: JournalRetentionPolicy, response_result: ResponseResult, ) -> Self { in_flight_invocation_metadata @@ -754,10 +753,13 @@ impl CompletedInvocation { .completion_retention_duration, journal_retention_duration: in_flight_invocation_metadata.journal_retention_duration, invocation_epoch: in_flight_invocation_metadata.current_invocation_epoch, - journal_metadata: if journal_retention_policy == 
JournalRetentionPolicy::Retain { - in_flight_invocation_metadata.journal_metadata - } else { + journal_metadata: if in_flight_invocation_metadata + .journal_retention_duration + .is_zero() + { JournalMetadata::empty() + } else { + in_flight_invocation_metadata.journal_metadata }, pinned_deployment: in_flight_invocation_metadata.pinned_deployment, completion_range_epoch_map: in_flight_invocation_metadata.completion_range_epoch_map, @@ -952,11 +954,13 @@ mod tests { let expected_trim_points = vec![]; assert_eq!( - map.clone().into_trim_points_iter().collect::>(), + map.clone() + .into_truncation_points_iter() + .collect::>(), expected_trim_points ); assert_eq!( - CompletionRangeEpochMap::from_trim_points(expected_trim_points), + CompletionRangeEpochMap::from_truncation_points(expected_trim_points), map ); } @@ -965,7 +969,7 @@ mod tests { fn trim_at_1() { let mut map = CompletionRangeEpochMap::default(); - map.add_trim_point(1, 1); + map.add_truncation_point(1, 1); // Before 1 is epoch 0, After including 1 is epoch 1 assert_eq!(map.maximum_epoch_for(0), 0); @@ -974,11 +978,13 @@ mod tests { let expected_trim_points = vec![(1, 1)]; assert_eq!( - map.clone().into_trim_points_iter().collect::>(), + map.clone() + .into_truncation_points_iter() + .collect::>(), expected_trim_points ); assert_eq!( - CompletionRangeEpochMap::from_trim_points(expected_trim_points), + CompletionRangeEpochMap::from_truncation_points(expected_trim_points), map ); } @@ -987,7 +993,7 @@ mod tests { fn multiple_trims() { let mut map = CompletionRangeEpochMap::default(); - map.add_trim_point(5, 1); + map.add_truncation_point(5, 1); // 0..=4 -> 0 // 5..=MAX -> 1 @@ -996,7 +1002,7 @@ mod tests { assert_eq!(map.maximum_epoch_for(5), 1); assert_eq!(map.maximum_epoch_for(CompletionId::MAX), 1); - map.add_trim_point(2, 2); + map.add_truncation_point(2, 2); // 0..=1 -> 0 // 2..=MAX -> 2 @@ -1006,7 +1012,7 @@ mod tests { assert_eq!(map.maximum_epoch_for(3), 2); 
assert_eq!(map.maximum_epoch_for(CompletionId::MAX), 2); - map.add_trim_point(5, 3); + map.add_truncation_point(5, 3); // 0..=1 -> 0 // 2..=4 -> 2 @@ -1021,11 +1027,13 @@ mod tests { let expected_trim_points = vec![(2, 2), (5, 3)]; assert_eq!( - map.clone().into_trim_points_iter().collect::>(), + map.clone() + .into_truncation_points_iter() + .collect::>(), expected_trim_points ); assert_eq!( - CompletionRangeEpochMap::from_trim_points(expected_trim_points), + CompletionRangeEpochMap::from_truncation_points(expected_trim_points), map ); } @@ -1034,7 +1042,7 @@ mod tests { fn trim_same_point_twice() { let mut map = CompletionRangeEpochMap::default(); - map.add_trim_point(2, 1); + map.add_truncation_point(2, 1); // 0..=2 -> 0 // 2..=MAX -> 1 @@ -1043,7 +1051,7 @@ mod tests { assert_eq!(map.maximum_epoch_for(2), 1); assert_eq!(map.maximum_epoch_for(CompletionId::MAX), 1); - map.add_trim_point(2, 2); + map.add_truncation_point(2, 2); // 0..=2 -> 0 // 2..=MAX -> 2 @@ -1054,11 +1062,13 @@ mod tests { let expected_trim_points = vec![(2, 2)]; assert_eq!( - map.clone().into_trim_points_iter().collect::>(), + map.clone() + .into_truncation_points_iter() + .collect::>(), expected_trim_points ); assert_eq!( - CompletionRangeEpochMap::from_trim_points(expected_trim_points), + CompletionRangeEpochMap::from_truncation_points(expected_trim_points), map ); } diff --git a/crates/types/src/errors.rs b/crates/types/src/errors.rs index 3ecc51de4a..33be20c0d6 100644 --- a/crates/types/src/errors.rs +++ b/crates/types/src/errors.rs @@ -176,6 +176,7 @@ pub mod codes { CONFLICT 409 "Conflict", NOT_READY 470 "Not ready", RESTARTED 471 "Restarted", + RESET 472 "Reset", ); } @@ -301,6 +302,9 @@ pub const RESTARTED_INVOCATION_ERROR: InvocationError = InvocationError::new_sta "The invocation was restarted. 
You can re-attach to it to retrieve the new result.", ); +pub const RESET_INVOCATION_ERROR: InvocationError = + InvocationError::new_static(codes::RESET, "The invocation was reset. You can re-attach to it to retrieve the new result."); + // TODO: Once we want to distinguish server side cancellations from user code returning the // UserErrorCode::Cancelled, we need to add a new RestateErrorCode. pub const CANCELED_INVOCATION_ERROR: InvocationError = diff --git a/crates/types/src/invocation/client.rs b/crates/types/src/invocation/client.rs index 78792913a8..1b5ba0d299 100644 --- a/crates/types/src/invocation/client.rs +++ b/crates/types/src/invocation/client.rs @@ -10,6 +10,7 @@ use crate::errors::InvocationError; use crate::identifiers::{InvocationId, PartitionProcessorRpcRequestId}; +use crate::invocation::reset::{ApplyToChildInvocations, ApplyToPinnedDeployment, TruncateFrom}; use crate::invocation::restart::{ApplyToWorkflowRun, IfRunning}; use crate::invocation::{ InvocationEpoch, InvocationQuery, InvocationRequest, InvocationResponse, InvocationTarget, @@ -119,6 +120,15 @@ pub enum RestartInvocationResponse { NotStarted, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ResetInvocationResponse { + Ok { new_epoch: InvocationEpoch }, + NotFound, + Unsupported, + NotRunning, + BadIndex, +} + /// This trait provides the functionalities to interact with Restate invocations. pub trait InvocationClient { /// Append the invocation to the log, waiting for the PP to emit [`SubmittedInvocationNotification`] when the command is processed. @@ -203,4 +213,15 @@ pub trait InvocationClient { previous_attempt_retention: Option, apply_to_workflow_run: ApplyToWorkflowRun, ) -> impl Future> + Send; + + /// See [`crate::invocation::reset::Request`].
+ fn reset_invocation( + &self, + request_id: PartitionProcessorRpcRequestId, + invocation_id: InvocationId, + truncate_from: TruncateFrom, + previous_attempt_retention: Option, + apply_to_child_invocations: ApplyToChildInvocations, + apply_to_pinned_deployment: ApplyToPinnedDeployment, + ) -> impl Future> + Send; } diff --git a/crates/types/src/invocation/mod.rs b/crates/types/src/invocation/mod.rs index 0dc27503d7..e51f720b93 100644 --- a/crates/types/src/invocation/mod.rs +++ b/crates/types/src/invocation/mod.rs @@ -1200,6 +1200,74 @@ pub mod restart { } } +/// Reset invocation command. See [reset::Request] +pub mod reset { + use super::*; + + /// Reset an invocation. + /// + /// This will truncate the journal from the given starting point, and will resume the execution on a new invocation attempt. + #[derive(Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub struct Request { + pub invocation_id: InvocationId, + + /// Where to truncate the journal from + pub truncate_from: TruncateFrom, + + /// If set, it will override the configured completion_retention/journal_retention when the invocation was executed the first time. + /// If neither completion_retention nor journal_retention is configured, and this previous_attempt_retention is also unset, then the previous attempt won't be retained at all. + /// + /// To retain the previous attempt, the new attempt will take the invocation id of the previous attempt (the one used to trigger this reset), + /// and the old invocation id will take a new randomly generated invocation id. + pub previous_attempt_retention: Option, + + /// What to do with child calls. This doesn't apply to sends.
+ pub apply_to_child_calls: ApplyToChildInvocations, + + /// What to do with pinned deployment + pub apply_to_pinned_deployment: ApplyToPinnedDeployment, + + /// Where to send the response for this command + pub response_sink: Option, + } + + impl WithInvocationId for Request { + fn invocation_id(&self) -> InvocationId { + self.invocation_id + } + } + + /// Flavor of the [`Request`]. + #[derive(Debug, Clone, Copy, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub enum TruncateFrom { + EntryIndex { + /// Entry index **inclusive**. + /// + /// Note: the index MUST correspond to a [`journal_v2::Command`] or a [`journal_v2::Signal`], + /// otherwise the command will be ignored. + entry_index: EntryIndex, + }, + } + + #[derive(Default, Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub enum ApplyToChildInvocations { + Nothing, + #[default] + Kill, + Cancel, + } + + #[derive(Default, Debug, Clone, Eq, PartialEq, serde::Serialize, serde::Deserialize)] + pub enum ApplyToPinnedDeployment { + #[default] + Keep, + /// Clear the pinned deployment. + /// + /// NOTE: If the new picked up deployment doesn't support the current service protocol version, the invocation will remain stuck in a retry loop. + Clear, + } +} + mod serde_hacks { //! Module where we hide all the hacks to make back-compat working! 
diff --git a/crates/types/src/journal_v2/command.rs b/crates/types/src/journal_v2/command.rs index e5b1b3b60b..702f9c48d0 100644 --- a/crates/types/src/journal_v2/command.rs +++ b/crates/types/src/journal_v2/command.rs @@ -193,7 +193,7 @@ pub struct ClearStateCommand { } impl_command_accessors!(ClearState -> [@metadata @from_entry @no_completion]); -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ClearAllStateCommand { pub name: ByteString, } diff --git a/crates/types/src/journal_v2/notification.rs b/crates/types/src/journal_v2/notification.rs index a4ab3d0c8b..40ed8a1194 100644 --- a/crates/types/src/journal_v2/notification.rs +++ b/crates/types/src/journal_v2/notification.rs @@ -19,7 +19,7 @@ use serde::{Deserialize, Serialize}; use std::fmt; /// See [`Notification`]. -#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, strum::EnumTryAs)] pub enum NotificationId { CompletionId(CompletionId), SignalIndex(SignalIndex), @@ -54,7 +54,7 @@ impl fmt::Display for NotificationId { } } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, derive_more::From)] pub enum NotificationType { Completion(CompletionType), Signal, @@ -69,12 +69,6 @@ impl fmt::Display for NotificationType { } } -impl From for NotificationType { - fn from(value: CompletionType) -> Self { - NotificationType::Completion(value) - } -} - impl From for EntryType { fn from(value: CompletionType) -> Self { EntryType::Notification(value.into()) diff --git a/crates/types/src/net/partition_processor.rs b/crates/types/src/net/partition_processor.rs index c682de0d08..cb1099e6cf 100644 --- a/crates/types/src/net/partition_processor.rs +++ b/crates/types/src/net/partition_processor.rs @@ -13,8 +13,9 @@ use crate::identifiers::{ }; use 
crate::invocation::client::{ CancelInvocationResponse, InvocationOutput, KillInvocationResponse, PurgeInvocationResponse, - RestartInvocationResponse, SubmittedInvocationNotification, + ResetInvocationResponse, RestartInvocationResponse, SubmittedInvocationNotification, }; +use crate::invocation::reset::{ApplyToChildInvocations, ApplyToPinnedDeployment, TruncateFrom}; use crate::invocation::restart::{ApplyToWorkflowRun, IfRunning}; use crate::invocation::{InvocationEpoch, InvocationQuery, InvocationRequest, InvocationResponse}; use crate::journal_v2::Signal; @@ -90,6 +91,13 @@ pub enum PartitionProcessorRpcRequestInner { previous_attempt_retention: Option, apply_to_workflow_run: ApplyToWorkflowRun, }, + ResetInvocation { + invocation_id: InvocationId, + truncate_from: TruncateFrom, + previous_attempt_retention: Option, + apply_to_child_calls: ApplyToChildInvocations, + apply_to_pinned_deployment: ApplyToPinnedDeployment, + }, } impl WithPartitionKey for PartitionProcessorRpcRequestInner { @@ -114,6 +122,9 @@ impl WithPartitionKey for PartitionProcessorRpcRequestInner { PartitionProcessorRpcRequestInner::RestartInvocation { invocation_id, .. } => { invocation_id.partition_key() } + PartitionProcessorRpcRequestInner::ResetInvocation { invocation_id, .. 
} => { + invocation_id.partition_key() + } } } } @@ -268,6 +279,43 @@ impl From for RestartInvocationRpcResponse { } } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum ResetInvocationRpcResponse { + Ok { new_epoch: InvocationEpoch }, + NotFound, + Unsupported, + NotRunning, + BadIndex, +} + +impl From for ResetInvocationResponse { + fn from(value: ResetInvocationRpcResponse) -> Self { + match value { + ResetInvocationRpcResponse::Ok { new_epoch } => { + ResetInvocationResponse::Ok { new_epoch } + } + ResetInvocationRpcResponse::NotFound => ResetInvocationResponse::NotFound, + ResetInvocationRpcResponse::Unsupported => ResetInvocationResponse::Unsupported, + ResetInvocationRpcResponse::NotRunning => ResetInvocationResponse::NotRunning, + ResetInvocationRpcResponse::BadIndex => ResetInvocationResponse::BadIndex, + } + } +} + +impl From for ResetInvocationRpcResponse { + fn from(value: ResetInvocationResponse) -> Self { + match value { + ResetInvocationResponse::Ok { new_epoch } => { + ResetInvocationRpcResponse::Ok { new_epoch } + } + ResetInvocationResponse::NotFound => ResetInvocationRpcResponse::NotFound, + ResetInvocationResponse::Unsupported => ResetInvocationRpcResponse::Unsupported, + ResetInvocationResponse::NotRunning => ResetInvocationRpcResponse::NotRunning, + ResetInvocationResponse::BadIndex => ResetInvocationRpcResponse::BadIndex, + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum PartitionProcessorRpcResponse { Appended, @@ -281,4 +329,5 @@ pub enum PartitionProcessorRpcResponse { PurgeInvocation(PurgeInvocationRpcResponse), PurgeJournal(PurgeInvocationRpcResponse), RestartInvocation(RestartInvocationRpcResponse), + ResetInvocation(ResetInvocationRpcResponse), } diff --git a/crates/wal-protocol/src/lib.rs b/crates/wal-protocol/src/lib.rs index 6d33bb42d4..1c280467a6 100644 --- a/crates/wal-protocol/src/lib.rs +++ b/crates/wal-protocol/src/lib.rs @@ -13,7 +13,8 @@ use 
restate_types::GenerationalNodeId; use restate_types::identifiers::{LeaderEpoch, PartitionId, PartitionKey, WithPartitionKey}; use restate_types::invocation::{ AttachInvocationRequest, GetInvocationOutputResponse, InvocationResponse, - InvocationTermination, NotifySignalRequest, PurgeInvocationRequest, ServiceInvocation, restart, + InvocationTermination, NotifySignalRequest, PurgeInvocationRequest, ServiceInvocation, reset, + restart, }; use restate_types::logs::{HasRecordKeys, Keys, MatchKeyQuery}; use restate_types::message::MessageIndex; @@ -150,6 +151,8 @@ pub enum Command { AttachInvocation(AttachInvocationRequest), /// Restart an invocation RestartInvocation(restart::Request), + /// Reset an ongoing invocation + ResetInvocation(reset::Request), // -- Partition processor events for PP /// Invoker is reporting effect(s) from an ongoing invocation. @@ -219,6 +222,9 @@ impl HasRecordKeys for Envelope { Command::InvocationResponse(response) => Keys::Single(response.partition_key()), Command::NotifySignal(sig) => Keys::Single(sig.partition_key()), Command::NotifyGetInvocationOutputResponse(res) => Keys::Single(res.partition_key()), + Command::ResetInvocation(reset_invocation) => { + Keys::Single(reset_invocation.partition_key()) + } } } } @@ -320,6 +326,7 @@ mod envelope { NotifySignal = 14, // protobuf PurgeJournal = 15, // flexbuffers RestartInvocation = 16, // flexbuffers + ResetInvocation = 17, // flexbuffers } #[derive(bilrost::Message)] @@ -449,6 +456,10 @@ mod envelope { CommandKind::RestartInvocation, Field::encode_serde(StorageCodecKind::FlexbuffersSerde, value), ), + Command::ResetInvocation(value) => ( + CommandKind::ResetInvocation, + Field::encode_serde(StorageCodecKind::FlexbuffersSerde, value), + ), Command::Invoke(value) => { let value = protobuf::ServiceInvocation::from(value.clone()); (CommandKind::Invoke, Field::encode_protobuf(&value)) @@ -542,6 +553,10 @@ mod envelope { codec_or_error!(envelope.command, StorageCodecKind::FlexbuffersSerde);
Command::RestartInvocation(envelope.command.decode_serde()?) } + CommandKind::ResetInvocation => { + codec_or_error!(envelope.command, StorageCodecKind::FlexbuffersSerde); + Command::ResetInvocation(envelope.command.decode_serde()?) + } CommandKind::Invoke => { codec_or_error!(envelope.command, StorageCodecKind::Protobuf); let value: protobuf::ServiceInvocation = envelope.command.decode_protobuf()?; diff --git a/crates/worker/src/partition/leadership/leader_state.rs b/crates/worker/src/partition/leadership/leader_state.rs index 198dd37e9d..679d872972 100644 --- a/crates/worker/src/partition/leadership/leader_state.rs +++ b/crates/worker/src/partition/leadership/leader_state.rs @@ -511,6 +511,16 @@ impl LeaderState { ))); } } + Action::ForwardResetInvocationResponse { + request_id, + response, + } => { + if let Some(response_tx) = self.awaiting_rpc_actions.remove(&request_id) { + response_tx.send(Ok(PartitionProcessorRpcResponse::ResetInvocation( + response.into(), + ))); + } + } } Ok(()) diff --git a/crates/worker/src/partition/mod.rs b/crates/worker/src/partition/mod.rs index 30379eb0cf..b19bd9f2ce 100644 --- a/crates/worker/src/partition/mod.rs +++ b/crates/worker/src/partition/mod.rs @@ -52,7 +52,7 @@ use restate_types::invocation::{ InvocationQuery, InvocationTarget, InvocationTargetType, InvocationTermination, NotifySignalRequest, PurgeInvocationRequest, ResponseResult, ServiceInvocation, ServiceInvocationResponseSink, SubmitNotificationSink, TerminationFlavor, WorkflowHandlerType, - restart, + reset, restart, }; use restate_types::logs::MatchKeyQuery; use restate_types::logs::{KeyFilter, Lsn, SequenceNumber}; @@ -844,6 +844,31 @@ where ) .await } + PartitionProcessorRpcRequestInner::ResetInvocation { + invocation_id, + truncate_from, + previous_attempt_retention, + apply_to_child_calls, + apply_to_pinned_deployment, + } => { + self.leadership_state + .handle_rpc_proposal_command( + request_id, + response_tx, + invocation_id.partition_key(), + 
Command::ResetInvocation(reset::Request { + invocation_id, + previous_attempt_retention, + apply_to_child_calls, + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { request_id }, + )), + truncate_from, + apply_to_pinned_deployment, + }), + ) + .await + } }; } diff --git a/crates/worker/src/partition/state_machine/actions.rs b/crates/worker/src/partition/state_machine/actions.rs index bbb6cf6469..9a125c977d 100644 --- a/crates/worker/src/partition/state_machine/actions.rs +++ b/crates/worker/src/partition/state_machine/actions.rs @@ -14,7 +14,7 @@ use restate_storage_api::timer_table::TimerKey; use restate_types::identifiers::{InvocationId, PartitionProcessorRpcRequestId}; use restate_types::invocation::client::{ CancelInvocationResponse, InvocationOutputResponse, KillInvocationResponse, - PurgeInvocationResponse, RestartInvocationResponse, + PurgeInvocationResponse, ResetInvocationResponse, RestartInvocationResponse, }; use restate_types::invocation::{InvocationEpoch, InvocationTarget}; use restate_types::journal::Completion; @@ -100,6 +100,10 @@ pub enum Action { request_id: PartitionProcessorRpcRequestId, response: RestartInvocationResponse, }, + ForwardResetInvocationResponse { + request_id: PartitionProcessorRpcRequestId, + response: ResetInvocationResponse, + }, } impl Action { diff --git a/crates/worker/src/partition/state_machine/invocation_status_ext.rs b/crates/worker/src/partition/state_machine/invocation_status_ext.rs index 9836bfad91..8d81b28b66 100644 --- a/crates/worker/src/partition/state_machine/invocation_status_ext.rs +++ b/crates/worker/src/partition/state_machine/invocation_status_ext.rs @@ -29,12 +29,12 @@ impl InvocationStatusExt for InvocationStatus { completion_id: CompletionId, ) -> bool { if let Some(im) = self.get_invocation_metadata() { - // We should accept completions for commands we haven't trimmed. + // We should accept completions for commands we haven't truncated. 
// The data structure completion_range_epoch_map tells us the completion ranges -> maximum invocation epoch mapping. // // When this_completion_invocation_epoch > maximum_epoch_for(completion_id), - // it means this completion instance was generated by a command that is **after** the trim point, - // thus it should be trimmed. + // it means this completion instance was generated by a command that is **after** the truncation point, + // thus it should be truncated. // // See the unit tests of CompletionRangeEpochMap for a good explanation of the different cases. im.completion_range_epoch_map @@ -80,9 +80,9 @@ mod tests { // journal index 1 -> command 1 with completion id 1 // journal index 2 -> command 2 with completion id 2 - // I trim with completion id 2 and bump invocation epoch from 0 to 1 + // I truncate with completion id 2 and bump invocation epoch from 0 to 1 let mut completion_range_epoch_map = CompletionRangeEpochMap::default(); - completion_range_epoch_map.add_trim_point(2, 1); + completion_range_epoch_map.add_truncation_point(2, 1); let is = invocation_status_with(completion_range_epoch_map); // If I get a completion id 2 for epoch 0, I discard it, because it belongs to the previous epoch. 
diff --git a/crates/worker/src/partition/state_machine/lifecycle/mod.rs b/crates/worker/src/partition/state_machine/lifecycle/mod.rs index ee6bdb4d47..3940e55cbb 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/mod.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/mod.rs @@ -18,6 +18,7 @@ mod notify_sleep_completion; mod pinned_deployment; mod purge; mod purge_journal; +mod reset; mod restart; mod resume; mod suspend; @@ -33,6 +34,7 @@ pub(super) use notify_sleep_completion::OnNotifySleepCompletionCommand; pub(super) use pinned_deployment::OnPinnedDeploymentCommand; pub(super) use purge::OnPurgeCommand; pub(super) use purge_journal::OnPurgeJournalCommand; +pub(super) use reset::OnResetInvocationCommand; pub(super) use restart::OnRestartInvocationCommand; pub(super) use resume::ResumeInvocationCommand; pub(super) use suspend::OnSuspendCommand; diff --git a/crates/worker/src/partition/state_machine/lifecycle/reset.rs b/crates/worker/src/partition/state_machine/lifecycle/reset.rs new file mode 100644 index 0000000000..04e93ae05b --- /dev/null +++ b/crates/worker/src/partition/state_machine/lifecycle/reset.rs @@ -0,0 +1,878 @@ +// Copyright (c) 2023 - 2025 Restate Software, Inc., Restate GmbH. +// All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. 
+ +use crate::debug_if_leader; +use crate::partition::state_machine::lifecycle::ArchiveInvocationCommand; +use crate::partition::state_machine::{ + Action, CommandHandler, Error, StateMachineApplyContext, should_use_journal_table_v2, +}; +use restate_invoker_api::InvokeInputJournal; +use restate_service_protocol_v4::entry_codec::ServiceProtocolV4Codec; +use restate_storage_api::fsm_table::FsmTable; +use restate_storage_api::invocation_status_table::{ + CompletedInvocation, InvocationStatus, InvocationStatusTable, +}; +use restate_storage_api::journal_table_v2::JournalTable; +use restate_storage_api::outbox_table::{OutboxMessage, OutboxTable}; +use restate_types::errors::RESET_INVOCATION_ERROR; +use restate_types::identifiers::InvocationId; +use restate_types::invocation::client::ResetInvocationResponse; +use restate_types::invocation::reset::{ApplyToChildInvocations, ApplyToPinnedDeployment}; +use restate_types::invocation::{ + IngressInvocationResponseSink, InvocationMutationResponseSink, InvocationTermination, + ResponseResult, TerminationFlavor, +}; +use restate_types::journal_v2::{ + Command, CommandMetadata, CompletionId, EntryIndex, EntryMetadata, EntryType, NotificationId, + NotificationType, +}; +use std::cmp; +use std::time::Duration; +use tracing::{trace, warn}; + +pub struct OnResetInvocationCommand { + pub invocation_id: InvocationId, + pub invocation_status: InvocationStatus, + pub truncation_point_entry_index: EntryIndex, + pub previous_attempt_retention: Option, + pub apply_to_child_calls: ApplyToChildInvocations, + pub apply_to_pinned_deployment: ApplyToPinnedDeployment, + pub response_sink: Option, +} + +impl<'ctx, 's: 'ctx, S> CommandHandler<&'ctx mut StateMachineApplyContext<'s, S>> + for OnResetInvocationCommand +where + S: JournalTable + InvocationStatusTable + OutboxTable + FsmTable, +{ + async fn apply(self, ctx: &'ctx mut StateMachineApplyContext<'s, S>) -> Result<(), Error> { + let OnResetInvocationCommand { + invocation_id, + 
invocation_status,
+            truncation_point_entry_index,
+            previous_attempt_retention,
+            apply_to_child_calls,
+            apply_to_pinned_deployment,
+            response_sink,
+        } = self;
+
+        if !should_use_journal_table_v2(&invocation_status) {
+            trace!(
+                "Received reset command for invocation using the old journal table, this is unsupported."
+            );
+            ctx.reply_to_reset_invocation(response_sink, ResetInvocationResponse::Unsupported);
+            return Ok(());
+        }
+
+        let is_invoked = matches!(invocation_status, InvocationStatus::Invoked(_));
+        let Some(mut in_flight_invocation_metadata) = invocation_status.into_invocation_metadata()
+        else {
+            debug_if_leader!(
+                ctx.is_leader,
+                "Ignoring reset command because the invocation is not invoked nor suspended"
+            );
+            ctx.reply_to_reset_invocation(response_sink, ResetInvocationResponse::NotRunning);
+            return Ok(());
+        };
+
+        // Validate the command first. The entry index must correspond to a command or signal index.
+        if truncation_point_entry_index == 0 {
+            debug_if_leader!(
+                ctx.is_leader,
+                "Ignoring reset command because truncation index is 0. You can't remove the input entry."
+            );
+            ctx.reply_to_reset_invocation(response_sink, ResetInvocationResponse::BadIndex);
+            return Ok(());
+        }
+        let Some(truncation_point_entry) = ctx
+            .storage
+            .get_journal_entry(invocation_id, truncation_point_entry_index)
+            .await?
+ else { + debug_if_leader!( + ctx.is_leader, + "Ignoring reset command because the given entry index doesn't exist" + ); + ctx.reply_to_reset_invocation(response_sink, ResetInvocationResponse::BadIndex); + return Ok(()); + }; + if !matches!( + truncation_point_entry.ty(), + EntryType::Command(_) | EntryType::Notification(NotificationType::Signal) + ) { + debug_if_leader!( + ctx.is_leader, + "Ignoring reset command because the given entry index doesn't correspond to a command entry, nor to a signal notification" + ); + ctx.reply_to_reset_invocation(response_sink, ResetInvocationResponse::BadIndex); + return Ok(()); + } + + // We need to send an abort signal to the invoker if the invocation was previously invoked + if is_invoked { + ctx.send_abort_invocation_to_invoker( + invocation_id, + in_flight_invocation_metadata.current_invocation_epoch, + ); + } + + // Sanity checks done, let's archive the invocation + ArchiveInvocationCommand { + invocation_id, + completed_invocation: CompletedInvocation::from_in_flight_invocation_metadata( + in_flight_invocation_metadata.clone(), + ResponseResult::Failure(RESET_INVOCATION_ERROR), + ), + previous_attempt_retention_override: previous_attempt_retention, + } + .apply(ctx) + .await?; + + // Let's apply the mutations to pinned deployment + match apply_to_pinned_deployment { + ApplyToPinnedDeployment::Keep => { + trace!("Will keep the pinned deployment"); + } + ApplyToPinnedDeployment::Clear => { + trace!("Clearing the pinned deployment"); + in_flight_invocation_metadata.pinned_deployment = None; + } + } + + debug_if_leader!( + ctx.is_leader, + "Rewriting journal starting from {}, index {}", + truncation_point_entry.ty(), + truncation_point_entry_index + ); + + let new_epoch = in_flight_invocation_metadata.current_invocation_epoch + 1; + + // We need to update the epochs of the journal prefix + ctx.storage + .update_current_journal_epoch(&invocation_id, new_epoch, truncation_point_entry_index) + .await?; + + // Let's run the mark 
and compact algorithm on the part of the journal we want to drop
+        let mut minimum_completion_id_of_removed_commands = CompletionId::MAX;
+        let mut notification_ids_to_forget = vec![];
+        let mut commands_removed = 0;
+        let mut truncated_journal_index = truncation_point_entry_index;
+        for truncation_pointer in
+            truncation_point_entry_index..in_flight_invocation_metadata.journal_metadata.length
+        {
+            let Some(entry) = ctx
+                .storage
+                .get_journal_entry(invocation_id, truncation_pointer)
+                .await?
+            else {
+                warn!("Missing entry at index {truncation_pointer}, this is unexpected");
+                return Ok(());
+            };
+
+            let keep_this_entry = match entry.ty() {
+                EntryType::Command(_) => {
+                    let cmd = entry.decode::<ServiceProtocolV4Codec, Command>()?;
+
+                    // Let's make sure minimum_completion_id_of_removed_commands remains minimum
+                    // We need to do this in the loop because the truncation point might be a non-completable entry.
+                    minimum_completion_id_of_removed_commands = cmp::min(
+                        minimum_completion_id_of_removed_commands,
+                        cmd.related_completion_ids()
+                            .into_iter()
+                            .min()
+                            .unwrap_or(CompletionId::MAX),
+                    );
+
+                    notification_ids_to_forget.extend(
+                        cmd.related_completion_ids()
+                            .into_iter()
+                            .map(NotificationId::CompletionId),
+                    );
+                    commands_removed += 1;
+
+                    // We remove the command
+                    trace!("Removing {} at index {}", entry.ty(), truncation_pointer);
+
+                    // If it's a call, we need to do something more
+                    if let Command::Call(call_command) = cmd {
+                        let child_invocation_id = call_command.request.invocation_id;
+                        match apply_to_child_calls {
+                            ApplyToChildInvocations::Nothing => {
+                                trace!("Won't kill nor cancel {}", child_invocation_id);
+                            }
+                            ApplyToChildInvocations::Kill => {
+                                trace!("Will kill {}", child_invocation_id);
+                                ctx.handle_outgoing_message(OutboxMessage::InvocationTermination(
+                                    InvocationTermination {
+                                        invocation_id: child_invocation_id,
+                                        flavor: TerminationFlavor::Kill,
+                                        response_sink: None,
+                                    },
+                                ))
+                                .await?;
+                            }
+                            ApplyToChildInvocations::Cancel => {
+                                trace!("Will cancel {}",
child_invocation_id); + ctx.handle_outgoing_message(OutboxMessage::InvocationTermination( + InvocationTermination { + invocation_id: child_invocation_id, + flavor: TerminationFlavor::Cancel, + response_sink: None, + }, + )) + .await?; + } + } + } + + false + } + EntryType::Notification(NotificationType::Completion(completion_ty)) => { + let completion_id = entry + .inner + .try_as_notification_ref() + .expect("Entry type is notification!") + .id() + .try_as_completion_id() + .expect("Notification is completion id!"); + + if completion_id < minimum_completion_id_of_removed_commands { + // We copy this completion because it belongs to a command before the trim point. + trace!( + "Retaining Completion {} with id {} at index {}", + completion_ty, completion_id, truncation_pointer + ); + true + } else { + // We remove this completion as it belongs to a command after (including) the trim point. + notification_ids_to_forget + .push(NotificationId::CompletionId(completion_id)); + trace!( + "Removing Completion {} with id {} at index {}", + completion_ty, completion_id, truncation_pointer + ); + false + } + } + EntryType::Notification(NotificationType::Signal) => { + // We remove this signal as it belongs to a command after (including) the trim point. 
+                    let notification_id = entry
+                        .inner
+                        .try_as_notification_ref()
+                        .expect("Entry type is notification!")
+                        .id();
+                    trace!(
+                        "Removing Notification Signal with id {} at index {}",
+                        notification_id, truncation_pointer
+                    );
+                    notification_ids_to_forget.push(notification_id);
+                    false
+                }
+                EntryType::Event => {
+                    // We just remove events
+                    trace!("Removing Event at index {}", truncation_pointer);
+                    false
+                }
+            };
+
+            if keep_this_entry {
+                ctx
+                    .storage
+                    .put_journal_entry(invocation_id, new_epoch, truncated_journal_index, &entry, /* No need to fill this, we only ever re-add back notifications and not commands */&[])
+                    .await?;
+                truncated_journal_index += 1;
+            }
+        }
+
+        // Time to drop the suffix we don't need
+        ctx.storage
+            .delete_journal_range(
+                invocation_id,
+                truncated_journal_index,
+                in_flight_invocation_metadata.journal_metadata.length,
+                &notification_ids_to_forget,
+            )
+            .await?;
+
+        // Update the epoch and add the truncation point to invocation status
+        in_flight_invocation_metadata.current_invocation_epoch = new_epoch;
+        in_flight_invocation_metadata
+            .completion_range_epoch_map
+            .add_truncation_point(minimum_completion_id_of_removed_commands, new_epoch);
+
+        // Update journal length with the new length and the commands.
+        in_flight_invocation_metadata.journal_metadata.length = truncated_journal_index;
+        in_flight_invocation_metadata.journal_metadata.commands -= commands_removed;
+
+        // Rewrite procedure done!
We're now back in the game + debug_if_leader!( + ctx.is_leader, + restate.journal.length = in_flight_invocation_metadata.journal_metadata.length, + "Journal rewriting completed, resuming the invocation now" + ); + ctx.invoke( + invocation_id, + in_flight_invocation_metadata, + InvokeInputJournal::NoCachedJournal, + ) + .await?; + + ctx.reply_to_reset_invocation(response_sink, ResetInvocationResponse::Ok { new_epoch }); + Ok(()) + } +} + +impl StateMachineApplyContext<'_, S> { + fn reply_to_reset_invocation( + &mut self, + response_sink: Option, + response: ResetInvocationResponse, + ) { + if response_sink.is_none() { + return; + } + let InvocationMutationResponseSink::Ingress(IngressInvocationResponseSink { request_id }) = + response_sink.unwrap(); + debug_if_leader!( + self.is_leader, + "Send reset response to request id '{:?}': {:?}", + request_id, + response + ); + + self.action_collector + .push(Action::ForwardResetInvocationResponse { + request_id, + response, + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + use std::time::Duration; + + use crate::partition::state_machine::tests::{TestEnv, fixtures, matchers}; + use googletest::{assert_that, prelude::*}; + use restate_storage_api::invocation_status_table::{ + CompletionRangeEpochMap, InFlightInvocationMetadata, InvocationStatusDiscriminants, + ReadOnlyInvocationStatusTable, + }; + use restate_storage_api::journal_table_v2::ReadOnlyJournalTable; + use restate_types::identifiers::PartitionProcessorRpcRequestId; + use restate_types::invocation::reset::TruncateFrom; + use restate_types::invocation::{ + InvocationTarget, NotifySignalRequest, ServiceInvocation, reset, + }; + use restate_types::journal_v2::raw::RawCommand; + use restate_types::journal_v2::{ + ClearAllStateCommand, CommandType, CompletionType, Signal, SignalId, SignalResult, + SleepCommand, + }; + use restate_types::service_protocol::ServiceProtocolVersion; + use restate_types::time::MillisSinceEpoch; + use 
restate_wal_protocol::timer::TimerKeyValue; + + #[restate_core::test] + async fn reset_with_empty_journal() { + let mut test_env = TestEnv::create().await; + + let invocation_id = fixtures::mock_start_invocation(&mut test_env).await; + fixtures::mock_pinned_deployment_v5(&mut test_env, invocation_id).await; + + for entry_index in 0..=2 { + // None of these should cause any trim to happen, because either is journal out of bound, or tries to trim input entry, which is special cased. + let actions = test_env + .apply(restate_wal_protocol::Command::ResetInvocation( + reset::Request { + invocation_id, + truncate_from: TruncateFrom::EntryIndex { entry_index }, + previous_attempt_retention: None, + apply_to_child_calls: Default::default(), + apply_to_pinned_deployment: Default::default(), + response_sink: None, + }, + )) + .await; + assert_that!(actions, empty()); + test_env + .verify_journal_components(invocation_id, [CommandType::Input.into()]) + .await; + } + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn reset_with_non_completable_entries() { + let mut test_env = TestEnv::create().await; + + let invocation_id = fixtures::mock_start_invocation(&mut test_env).await; + fixtures::mock_pinned_deployment_v5(&mut test_env, invocation_id).await; + + let _ = test_env + .apply_multiple([ + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + // Any command will do fine + ClearAllStateCommand::default(), + ), + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + // Any command will do fine + ClearAllStateCommand::default(), + ), + ]) + .await; + assert_that!( + test_env.storage.get_invocation_status(&invocation_id).await, + // [Input, ClearAllState, ClearAllState] + ok(all!( + matchers::storage::has_journal_length(3), + matchers::storage::has_commands(3) + )) + ); + + let reset_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(restate_wal_protocol::Command::ResetInvocation( + reset::Request 
{ + invocation_id, + truncate_from: TruncateFrom::EntryIndex { entry_index: 2 }, + previous_attempt_retention: None, + apply_to_child_calls: Default::default(), + apply_to_pinned_deployment: Default::default(), + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: reset_request_id, + }, + )), + }, + )) + .await; + assert_that!( + actions, + all!( + contains(matchers::actions::invoke_for_id_and_epoch(invocation_id, 1)), + contains(pat!(Action::ForwardResetInvocationResponse { + request_id: eq(reset_request_id), + response: eq(ResetInvocationResponse::Ok { new_epoch: 1 }) + })), + ) + ); + assert_that!( + test_env.storage.get_invocation_status(&invocation_id).await, + // Only Input entry and first clear state + ok(all!( + matchers::storage::status(InvocationStatusDiscriminants::Invoked), + matchers::storage::in_flight_meta(pat!(InFlightInvocationMetadata { + current_invocation_epoch: eq(1), + // There were no completable entries among the trimmed ones, so this map should be unchanged. 
+ completion_range_epoch_map: eq(CompletionRangeEpochMap::default()) + })) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [CommandType::Input.into(), CommandType::ClearAllState.into()], + ) + .await; + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn reset_with_completable_entries() { + let mut test_env = TestEnv::create().await; + + let invocation_id = fixtures::mock_start_invocation(&mut test_env).await; + fixtures::mock_pinned_deployment_v5(&mut test_env, invocation_id).await; + + let wake_up_time = MillisSinceEpoch::now(); + + let _ = test_env + .apply_multiple([ + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + SleepCommand { + wake_up_time, + completion_id: 1, + name: Default::default(), + }, + ), + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + SleepCommand { + wake_up_time: wake_up_time + Duration::from_secs(60), + completion_id: 2, + name: Default::default(), + }, + ), + ]) + .await; + assert_that!( + test_env.storage.get_invocation_status(&invocation_id).await, + // [Input, SleepCommand, SleepCommand] + ok(matchers::storage::has_journal_length(3)) + ); + + // Let's complete one of the sleeps + let _ = test_env + .apply(restate_wal_protocol::Command::Timer( + TimerKeyValue::complete_journal_entry(wake_up_time, invocation_id, 1, 0), + )) + .await; + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::Sleep.into(), + CommandType::Sleep.into(), + CompletionType::Sleep.into(), + ], + ) + .await; + + let actions = test_env + .apply(restate_wal_protocol::Command::ResetInvocation( + reset::Request { + invocation_id, + truncate_from: TruncateFrom::EntryIndex { entry_index: 2 }, + previous_attempt_retention: None, + apply_to_child_calls: Default::default(), + apply_to_pinned_deployment: Default::default(), + response_sink: None, + }, + )) + .await; + assert_that!( + actions, + 
contains(matchers::actions::invoke_for_id_and_epoch(invocation_id, 1)) + ); + assert_that!( + test_env.storage.get_invocation_status(&invocation_id).await, + // Only Input entry and first clear state + ok(all!( + matchers::storage::status(InvocationStatusDiscriminants::Invoked), + matchers::storage::in_flight_meta(pat!(InFlightInvocationMetadata { + current_invocation_epoch: eq(1), + // This should contain the trim point! + completion_range_epoch_map: eq( + CompletionRangeEpochMap::from_truncation_points([(2, 1)]) + ) + })) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::Sleep.into(), + CompletionType::Sleep.into(), + ], + ) + .await; + assert_that!( + test_env + .storage + .get_command_by_completion_id(invocation_id, 2) + .await, + // This was the second Sleep + ok(none()) + ); + assert_that!( + test_env + .storage + .get_command_by_completion_id(invocation_id, 1) + .await, + // This was the first + ok(some(property!( + RawCommand.ty(), + eq(EntryType::Command(CommandType::Sleep)) + ))) + ); + assert_that!( + test_env + .storage + .get_notifications_index(invocation_id) + .await, + // First notification is there + ok(eq(HashMap::from([(NotificationId::CompletionId(1), 2u32)]))) + ); + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn reset_from_signal_notification() { + let mut test_env = TestEnv::create().await; + + let invocation_id = fixtures::mock_start_invocation(&mut test_env).await; + fixtures::mock_pinned_deployment_v5(&mut test_env, invocation_id).await; + + let wake_up_time = MillisSinceEpoch::now(); + + let _ = test_env + .apply_multiple([ + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + SleepCommand { + wake_up_time, + completion_id: 1, + name: Default::default(), + }, + ), + restate_wal_protocol::Command::NotifySignal(NotifySignalRequest { + invocation_id, + signal: Signal::new(SignalId::for_index(17), SignalResult::Void), + }), + 
fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + SleepCommand { + wake_up_time: wake_up_time + Duration::from_secs(60), + completion_id: 2, + name: Default::default(), + }, + ), + ]) + .await; + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::Sleep.into(), + NotificationType::Signal.into(), + CommandType::Sleep.into(), + ], + ) + .await; + + // Let's complete one of the sleeps + let _ = test_env + .apply(restate_wal_protocol::Command::Timer( + TimerKeyValue::complete_journal_entry(wake_up_time, invocation_id, 1, 0), + )) + .await; + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::Sleep.into(), + NotificationType::Signal.into(), + CommandType::Sleep.into(), + CompletionType::Sleep.into(), + ], + ) + .await; + + let actions = test_env + .apply(restate_wal_protocol::Command::ResetInvocation( + reset::Request { + invocation_id, + truncate_from: TruncateFrom::EntryIndex { entry_index: 2 }, + previous_attempt_retention: None, + apply_to_child_calls: Default::default(), + apply_to_pinned_deployment: Default::default(), + response_sink: None, + }, + )) + .await; + assert_that!( + actions, + contains(matchers::actions::invoke_for_id_and_epoch(invocation_id, 1)) + ); + assert_that!( + test_env.storage.get_invocation_status(&invocation_id).await, + ok(all!( + matchers::storage::status(InvocationStatusDiscriminants::Invoked), + matchers::storage::in_flight_meta(pat!(InFlightInvocationMetadata { + current_invocation_epoch: eq(1), + // This should contain the trim point! 
+ completion_range_epoch_map: eq( + CompletionRangeEpochMap::from_truncation_points([(2, 1)]) + ) + })) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::Sleep.into(), + CompletionType::Sleep.into(), + ], + ) + .await; + assert_that!( + test_env + .storage + .get_command_by_completion_id(invocation_id, 2) + .await, + // This was the second Sleep + ok(none()) + ); + assert_that!( + test_env + .storage + .get_command_by_completion_id(invocation_id, 1) + .await, + // This was the first + ok(some(property!( + RawCommand.ty(), + eq(EntryType::Command(CommandType::Sleep)) + ))) + ); + assert_that!( + test_env + .storage + .get_notifications_index(invocation_id) + .await, + // First notification is there + ok(eq(HashMap::from([(NotificationId::CompletionId(1), 2u32)]))) + ); + + test_env.shutdown().await; + } + + #[restate_core::test] + async fn reset_with_retention() { + let mut test_env = TestEnv::create().await; + + let retention = Duration::from_secs(60) * 60 * 24; + let invocation_target = InvocationTarget::mock_service(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + + let _ = test_env + .apply(restate_wal_protocol::Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + completion_retention_duration: retention, + journal_retention_duration: retention, + ..ServiceInvocation::mock() + })) + .await; + + let _ = test_env + .apply_multiple([ + fixtures::pinned_deployment(invocation_id, ServiceProtocolVersion::V5), + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + // Any command will do fine + ClearAllStateCommand::default(), + ), + fixtures::invoker_entry_effect_for_epoch( + invocation_id, + 0, + // Any command will do fine + ClearAllStateCommand::default(), + ), + ]) + .await; + test_env + .verify_journal_components( + invocation_id, + [ + CommandType::Input.into(), + CommandType::ClearAllState.into(), + 
CommandType::ClearAllState.into(), + ], + ) + .await; + + let reset_request_id = PartitionProcessorRpcRequestId::default(); + let actions = test_env + .apply(restate_wal_protocol::Command::ResetInvocation( + reset::Request { + invocation_id, + truncate_from: TruncateFrom::EntryIndex { entry_index: 2 }, + previous_attempt_retention: None, + apply_to_child_calls: Default::default(), + apply_to_pinned_deployment: Default::default(), + response_sink: Some(InvocationMutationResponseSink::Ingress( + IngressInvocationResponseSink { + request_id: reset_request_id, + }, + )), + }, + )) + .await; + + // Verify the reset worked correctly + assert_that!( + actions, + all!( + contains(matchers::actions::invoke_for_id_and_epoch(invocation_id, 1)), + contains(pat!(Action::ForwardResetInvocationResponse { + request_id: eq(reset_request_id), + response: eq(ResetInvocationResponse::Ok { new_epoch: 1 }) + })), + ) + ); + assert_that!( + test_env.storage.get_invocation_status(&invocation_id).await, + // Only Input entry and first clear state + ok(all!( + matchers::storage::status(InvocationStatusDiscriminants::Invoked), + matchers::storage::is_epoch(1), + matchers::storage::in_flight_meta(pat!(InFlightInvocationMetadata { + // There were no completable entries among the trimmed ones, so this map should be unchanged. 
+ completion_range_epoch_map: eq(CompletionRangeEpochMap::default()) + })) + )) + ); + test_env + .verify_journal_components( + invocation_id, + [CommandType::Input.into(), CommandType::ClearAllState.into()], + ) + .await; + + // Verify we have the archived invocation + assert_that!( + test_env + .storage() + .get_invocation_status_for_epoch(&invocation_id, 0) + .await, + ok(all!( + matchers::storage::is_variant(InvocationStatusDiscriminants::Completed), + matchers::storage::is_epoch(0), + )) + ); + test_env + .verify_journal_components_for_epoch( + invocation_id, + 0, + [ + CommandType::Input.into(), + CommandType::ClearAllState.into(), + CommandType::ClearAllState.into(), + ], + ) + .await; + + test_env.shutdown().await; + } +} diff --git a/crates/worker/src/partition/state_machine/lifecycle/restart.rs b/crates/worker/src/partition/state_machine/lifecycle/restart.rs index 9e73e0ec76..53eae29afc 100644 --- a/crates/worker/src/partition/state_machine/lifecycle/restart.rs +++ b/crates/worker/src/partition/state_machine/lifecycle/restart.rs @@ -19,7 +19,7 @@ use restate_service_protocol_v4::entry_codec::ServiceProtocolV4Codec; use restate_storage_api::fsm_table::FsmTable; use restate_storage_api::invocation_status_table::{ CompletedInvocation, InFlightInvocationMetadata, InvocationStatus, InvocationStatusTable, - JournalRetentionPolicy, StatusTimestamps, + StatusTimestamps, }; use restate_storage_api::journal_table as journal_table_v1; use restate_storage_api::journal_table_v2::{JournalTable, ReadOnlyJournalTable}; @@ -145,7 +145,7 @@ where .await?; // Send abort invocation to invoker - ctx.do_send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); + ctx.send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); // Write output entry let response_result = ResponseResult::Failure(RESTARTED_INVOCATION_ERROR); @@ -188,14 +188,8 @@ where // Prepare the completed status let original_journal_length = metadata.journal_metadata.length; - let 
journal_retention_policy = if metadata.journal_retention_duration.is_zero() { - JournalRetentionPolicy::Drop - } else { - JournalRetentionPolicy::Retain - }; let completed_invocation = CompletedInvocation::from_in_flight_invocation_metadata( metadata, - journal_retention_policy, response_result, ); @@ -284,7 +278,7 @@ where invocation .completion_range_epoch_map - .add_trim_point(0, new_epoch); + .add_truncation_point(0, new_epoch); let in_flight_invocation_metadata = InFlightInvocationMetadata { invocation_target: invocation.invocation_target, diff --git a/crates/worker/src/partition/state_machine/mod.rs b/crates/worker/src/partition/state_machine/mod.rs index 9980225283..446c579dda 100644 --- a/crates/worker/src/partition/state_machine/mod.rs +++ b/crates/worker/src/partition/state_machine/mod.rs @@ -33,7 +33,7 @@ use restate_storage_api::idempotency_table::{IdempotencyTable, ReadOnlyIdempoten use restate_storage_api::inbox_table::{InboxEntry, InboxTable}; use restate_storage_api::invocation_status_table::{ CompletedInvocation, InFlightInvocationMetadata, InboxedInvocation, InvocationStatusTable, - JournalRetentionPolicy, PreFlightInvocationMetadata, ReadOnlyInvocationStatusTable, + PreFlightInvocationMetadata, ReadOnlyInvocationStatusTable, }; use restate_storage_api::invocation_status_table::{InvocationStatus, ScheduledInvocation}; use restate_storage_api::journal_table::ReadOnlyJournalTable; @@ -68,7 +68,7 @@ use restate_types::invocation::{ InvocationTargetType, InvocationTermination, JournalCompletionTarget, NotifySignalRequest, ResponseResult, RestateVersion, ServiceInvocation, ServiceInvocationResponseSink, ServiceInvocationSpanContext, Source, SubmitNotificationSink, TerminationFlavor, - VirtualObjectHandlerType, WorkflowHandlerType, + VirtualObjectHandlerType, WorkflowHandlerType, reset, }; use restate_types::invocation::{InvocationInput, SpanRelation}; use restate_types::journal::Completion; @@ -395,23 +395,6 @@ impl StateMachineApplyContext<'_, S> { 
}); } - fn send_abort_invocation_to_invoker( - &mut self, - invocation_id: InvocationId, - invocation_epoch: InvocationEpoch, - ) { - debug_if_leader!( - self.is_leader, - restate.invocation.id = %invocation_id, - "Send abort command to invoker" - ); - - self.action_collector.push(Action::AbortInvocation { - invocation_id, - invocation_epoch, - }); - } - async fn on_apply(&mut self, command: Command) -> Result<(), Error> where S: IdempotencyTable @@ -535,6 +518,26 @@ impl StateMachineApplyContext<'_, S> { .await?; Ok(()) } + Command::ResetInvocation(trim_invocation_request) => { + let status = self + .get_invocation_status(&trim_invocation_request.invocation_id) + .await?; + + let reset::TruncateFrom::EntryIndex { entry_index } = + trim_invocation_request.truncate_from; + lifecycle::OnResetInvocationCommand { + invocation_id: trim_invocation_request.invocation_id, + invocation_status: status, + truncation_point_entry_index: entry_index, + previous_attempt_retention: trim_invocation_request.previous_attempt_retention, + apply_to_child_calls: trim_invocation_request.apply_to_child_calls, + apply_to_pinned_deployment: trim_invocation_request.apply_to_pinned_deployment, + response_sink: trim_invocation_request.response_sink, + } + .apply(self) + .await?; + Ok(()) + } } } @@ -1120,7 +1123,7 @@ impl StateMachineApplyContext<'_, S> { // This can happen because the invoke/resume and the abort invoker messages end up in different queues, // and the abort message can overtake the invoke/resume. // Consequently the invoker might have not received the abort and the user tried to send it again. 
- self.do_send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); + self.send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); self.reply_to_kill(response_sink, KillInvocationResponse::NotFound); } }; @@ -1262,7 +1265,7 @@ impl StateMachineApplyContext<'_, S> { // This can happen because the invoke/resume and the abort invoker messages end up in different queues, // and the abort message can overtake the invoke/resume. // Consequently the invoker might have not received the abort and the user tried to send it again. - self.do_send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); + self.send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); self.reply_to_cancel(response_sink, CancelInvocationResponse::NotFound); } }; @@ -1409,7 +1412,7 @@ impl StateMachineApplyContext<'_, S> { Some(ResponseResult::Failure(KILLED_INVOCATION_ERROR)), ) .await?; - self.do_send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); + self.send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); Ok(()) } @@ -1438,7 +1441,7 @@ impl StateMachineApplyContext<'_, S> { Some(ResponseResult::Failure(KILLED_INVOCATION_ERROR)), ) .await?; - self.do_send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); + self.send_abort_invocation_to_invoker(invocation_id, InvocationEpoch::MAX); Ok(()) } @@ -1802,7 +1805,7 @@ impl StateMachineApplyContext<'_, S> { trace!( "Received invoker effect for invocation not in invoked status. Ignoring the effect." ); - self.do_send_abort_invocation_to_invoker(invocation_id, effect_invocation_epoch); + self.send_abort_invocation_to_invoker(invocation_id, effect_invocation_epoch); return Ok(()); } @@ -1815,7 +1818,7 @@ impl StateMachineApplyContext<'_, S> { "Received invoker effect for invocation with different epoch. Current epoch {} != Invoker effect epoch {}. 
Ignoring the effect.", current_invocation_epoch, effect_invocation_epoch ); - self.do_send_abort_invocation_to_invoker(invocation_id, effect_invocation_epoch); + self.send_abort_invocation_to_invoker(invocation_id, effect_invocation_epoch); return Ok(()); } @@ -2017,11 +2020,6 @@ impl StateMachineApplyContext<'_, S> { if !completion_retention.is_zero() { let completed_invocation = CompletedInvocation::from_in_flight_invocation_metadata( invocation_metadata, - if journal_retention.is_zero() { - JournalRetentionPolicy::Drop - } else { - JournalRetentionPolicy::Retain - }, response_result, ); self.do_store_completed_invocation(invocation_id, completed_invocation) @@ -4042,7 +4040,7 @@ impl StateMachineApplyContext<'_, S> { Ok(()) } - fn do_send_abort_invocation_to_invoker( + fn send_abort_invocation_to_invoker( &mut self, invocation_id: InvocationId, invocation_epoch: InvocationEpoch, diff --git a/crates/worker/src/partition/state_machine/tests/invocation_epoch_awareness.rs b/crates/worker/src/partition/state_machine/tests/invocation_epoch_awareness.rs index 43e4f0f5f6..558e182d8e 100644 --- a/crates/worker/src/partition/state_machine/tests/invocation_epoch_awareness.rs +++ b/crates/worker/src/partition/state_machine/tests/invocation_epoch_awareness.rs @@ -25,7 +25,7 @@ async fn fence_old_calls_and_completions() { .modify_invocation_status(invocation_id, |is| { let im = is.get_invocation_metadata_mut().unwrap(); im.current_invocation_epoch += 1; - im.completion_range_epoch_map.add_trim_point(2, 1); + im.completion_range_epoch_map.add_truncation_point(2, 1); }) .await; @@ -186,7 +186,7 @@ async fn fence_old_sleep_and_completions() { .modify_invocation_status(invocation_id, |is| { let im = is.get_invocation_metadata_mut().unwrap(); im.current_invocation_epoch += 1; - im.completion_range_epoch_map.add_trim_point(1, 1); + im.completion_range_epoch_map.add_truncation_point(1, 1); }) .await; diff --git a/crates/worker/src/partition/state_machine/tests/matchers.rs 
b/crates/worker/src/partition/state_machine/tests/matchers.rs
index 61aba1ca5a..7cc16e2747 100644
--- a/crates/worker/src/partition/state_machine/tests/matchers.rs
+++ b/crates/worker/src/partition/state_machine/tests/matchers.rs
@@ -20,11 +20,12 @@ use restate_types::journal_v2::{Entry, EntryIndex};
 pub mod storage {
     use super::*;
-    use restate_service_protocol::codec::ProtobufRawEntryCodec;
+    use restate_service_protocol::codec::ProtobufRawEntryCodec;
     use restate_storage_api::inbox_table::{InboxEntry, SequenceNumberInboxEntry};
     use restate_storage_api::invocation_status_table::{
-        InvocationStatus, InvocationStatusDiscriminants,
+        InFlightInvocationMetadata, InvocationStatus, InvocationStatusDiscriminants,
+        JournalMetadata,
     };
     use restate_storage_api::journal_table::JournalEntry;
     use restate_types::identifiers::InvocationId;
@@ -34,24 +35,37 @@ pub mod storage {
     pub fn has_journal_length(
         journal_length: EntryIndex,
     ) -> impl Matcher<ActualT = InvocationStatus> {
-        predicate(move |is: &InvocationStatus| {
-            is.get_journal_metadata()
-                .is_some_and(|jm| jm.length == journal_length)
-        })
-        .with_description(
-            format!("has journal length {}", journal_length),
-            format!("hasn't journal length {}", journal_length),
+        // NOTE: calls property_matcher::internal directly so the mismatch message names the projected property.
+        property_matcher::internal::property_matcher(
+            |o: &InvocationStatus| o.get_journal_metadata().map(|jm| jm.length),
+            "get_journal_metadata()",
+            some(eq(journal_length)),
         )
     }
 
+    pub fn in_flight_meta(
+        inner: impl Matcher<ActualT = InFlightInvocationMetadata> + 'static,
+    ) -> impl Matcher<ActualT = InvocationStatus> {
+        // NOTE: calls property_matcher::internal directly so the mismatch message names the projected property.
+        property_matcher::internal::property_matcher(
+            |o: &InvocationStatus| o.get_invocation_metadata().cloned(),
+            "get_invocation_metadata()",
+            some(inner),
+        )
+    }
+
+    pub fn status(
+        status: InvocationStatusDiscriminants,
+    ) -> impl Matcher<ActualT = InvocationStatus> {
+        property!(InvocationStatus.discriminant(), some(eq(status)))
+    }
+
     pub fn has_commands(commands: EntryIndex) -> impl Matcher<ActualT = InvocationStatus> {
-        predicate(move |is: &InvocationStatus| {
-            is.get_journal_metadata()
-                .is_some_and(|jm| jm.commands == commands)
-        })
-        .with_description(
-            format!("has commands {}", commands),
-            format!("hasn't commands {}", commands),
+        // NOTE: calls property_matcher::internal directly so the mismatch message names the projected property.
+        property_matcher::internal::property_matcher(
+            |o: &InvocationStatus| o.get_journal_metadata().cloned(),
+            "get_journal_metadata()",
+            some(field!(JournalMetadata.commands, eq(commands))),
         )
     }
 
@@ -101,7 +115,7 @@ pub mod actions {
     use restate_types::invocation::client::{
         CancelInvocationResponse, KillInvocationResponse, PurgeInvocationResponse,
     };
-    use restate_types::invocation::{InvocationTarget, ResponseResult};
+    use restate_types::invocation::{InvocationEpoch, InvocationTarget, ResponseResult};
     use restate_types::journal_v2::{Notification, Signal};
 
     pub fn invoke_for_id(invocation_id: InvocationId) -> impl Matcher<ActualT = Action> {
@@ -120,6 +134,16 @@ pub mod actions {
         })
     }
 
+    pub fn invoke_for_id_and_epoch(
+        invocation_id: InvocationId,
+        invocation_epoch: InvocationEpoch,
+    ) -> impl Matcher<ActualT = Action> {
+        pat!(Action::Invoke {
+            invocation_id: eq(invocation_id),
+            invocation_epoch: eq(invocation_epoch)
+        })
+    }
+
     pub fn delete_sleep_timer(entry_index: EntryIndex) -> impl Matcher<ActualT = Action> {
         pat!(Action::DeleteTimer {
             timer_key: pat!(TimerKey {
diff --git a/tools/xtask/src/main.rs b/tools/xtask/src/main.rs
index 438b1a3e66..6369e646ab 100644
--- a/tools/xtask/src/main.rs
+++ b/tools/xtask/src/main.rs
@@ -28,7 +28,11 @@ use restate_types::identifiers::{InvocationId, PartitionProcessorRpcRequestId, S
 use restate_types::invocation::client::{
     AttachInvocationResponse, CancelInvocationResponse,
GetInvocationOutputResponse, InvocationClient, InvocationClientError, InvocationOutput, KillInvocationResponse, - PurgeInvocationResponse, RestartInvocationResponse, SubmittedInvocationNotification, + PurgeInvocationResponse, ResetInvocationResponse, RestartInvocationResponse, + SubmittedInvocationNotification, +}; +use restate_types::invocation::reset::{ + ApplyToChildInvocations, ApplyToPinnedDeployment, TruncateFrom, }; use restate_types::invocation::restart::{ApplyToWorkflowRun, IfRunning}; use restate_types::invocation::{ @@ -199,6 +203,18 @@ impl InvocationClient for Mock { ) -> impl Future> + Send { pending() } + + fn reset_invocation( + &self, + _: PartitionProcessorRpcRequestId, + _: InvocationId, + _: TruncateFrom, + _: Option, + _: ApplyToChildInvocations, + _: ApplyToPinnedDeployment, + ) -> impl Future> + Send { + pending() + } } async fn generate_rest_api_doc() -> anyhow::Result<()> {