From e8079456caa3a75b70d5c2e026024a0fb13a7752 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 5 Mar 2026 14:11:11 -0800 Subject: [PATCH 1/2] clean up kv_reorder status --- onnxruntime/core/providers/openvino/ov_interface.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 3083b3199b2ad..e4f805ff0fcd4 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -615,7 +615,12 @@ void StatefulOVInferRequest::RewindKVCache(size_t index) { if (index == 0) { // In this case, since we're resetting the entire KVCache, simply reset the state. ovInfReq.reset_state(); + if (is_kvcache_reorder_added) { + kv_src_indices.clear(); + kv_dst_indices.clear(); + } } else { + // TODO for is_kvcache_reorder_added: do inference once to make sure the KV cache state is updated with the latest generated token before we the KV cache // Retrieve KVCache states and trim them to the specified index. 
// The following logic is adapted from: // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 From 74efe02a8131f76a56aff31f81556df476f76445 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 5 Mar 2026 14:13:30 -0800 Subject: [PATCH 2/2] refactor reorderKVCache code --- .../core/providers/openvino/backend_manager.cc | 4 ++-- .../core/providers/openvino/backend_manager.h | 2 +- .../providers/openvino/backends/basic_backend.cc | 4 ++-- .../providers/openvino/backends/basic_backend.h | 2 +- onnxruntime/core/providers/openvino/ibackend.h | 2 +- .../openvino/openvino_execution_provider.cc | 2 +- .../core/providers/openvino/ov_interface.cc | 15 ++++++++------- .../core/providers/openvino/ov_interface.h | 5 +++-- 8 files changed, 19 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d98311b617027..fb2ab32158d3a 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -779,9 +779,9 @@ void BackendManager::RewindKVCache(size_t index) { } } -void BackendManager::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void BackendManager::SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices) { if (concrete_backend_) { - concrete_backend_->ReorderKVCache(src_indices, dst_indices); + concrete_backend_->SetReorderKVCacheStatus(src_indices, dst_indices); } } diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index f8a74b9cbcfa4..1de86b663d868 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -31,7 +31,7 @@ class BackendManager { void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data); 
ov::CompiledModel GetOVCompiledModel(); void RewindKVCache(size_t index); - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices); + void SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices); private: std::unique_ptr GetModelProtoFromFusedNode( diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index c3f657f151f45..84ee1e34aa26c 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -311,9 +311,9 @@ void BasicBackend::RewindKVCache(size_t index) { }); } -void BasicBackend::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void BasicBackend::SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices) { infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) { - infer_request->ReorderKVCache(src_indices, dst_indices); + infer_request->SetReorderKVCacheStatus(src_indices, dst_indices); }); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index d8af2ce7fd595..dc8ad3c55f323 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -151,7 +151,7 @@ class BasicBackend : public IBackend { return exe_network_.Get(); } void RewindKVCache(size_t index) override; - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; + void SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices) override; private: bool ValidateSubgraph(std::map>& const_outputs_map); diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 4444f37ac7433..8ef746a9b8cac 100644 --- 
a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -18,7 +18,7 @@ class IBackend { virtual ov::CompiledModel GetOVCompiledModel() = 0; virtual ~IBackend() = default; virtual void RewindKVCache(size_t index) {} - virtual void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {} + virtual void SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices) {} }; using ptr_stream_t = std::unique_ptr; class BackendFactory { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index e2d14b9e761b6..a8fa61cf66d47 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -343,7 +343,7 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span>(src_indices), std::get>(dst_indices)); + backend.SetReorderKVCacheStatus(std::get>(src_indices), std::get>(dst_indices)); } } else { // Handle unknown options diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index e4f805ff0fcd4..c7abde13a92d2 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -573,23 +573,27 @@ void StatefulOVInferRequest::Infer() { } void StatefulOVInferRequest::PostProcessInferRequest() { + CleanupReorderStatus(); +} + +void StatefulOVInferRequest::CleanupReorderStatus() { if (is_kvcache_reorder_added) { kv_src_indices.clear(); kv_dst_indices.clear(); } } -void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void StatefulOVInferRequest::SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices) { // Validate input parameters if (src_indices.size() != dst_indices.size()) { 
ORT_THROW(log_tag + - "ReorderKVCache: src_indices and dst_indices must have the same size. " + "SetReorderKVCacheStatus: src_indices and dst_indices must have the same size. " "Got src_indices.size()=" + std::to_string(src_indices.size()) + ", dst_indices.size()=" + std::to_string(dst_indices.size())); } - LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " + LOGS_DEFAULT(INFO) << log_tag << "SetReorderKVCacheStatus: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; kv_src_indices = src_indices; @@ -615,10 +619,7 @@ void StatefulOVInferRequest::RewindKVCache(size_t index) { if (index == 0) { // In this case, since we're resetting the entire KVCache, simply reset the state. ovInfReq.reset_state(); - if (is_kvcache_reorder_added) { - kv_src_indices.clear(); - kv_dst_indices.clear(); - } + CleanupReorderStatus(); } else { // TODO for is_kvcache_reorder_added: do inference once to make sure the KV cache state is updated with the latest generated token before we the KV cache // Retrieve KVCache states and trim them to the specified index. 
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 20693e3349ba8..6862860a71f0f 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -137,7 +137,7 @@ class OVInferRequest { return ovInfReq; } virtual void RewindKVCache([[maybe_unused]] size_t index) {} - virtual void ReorderKVCache([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} + virtual void SetReorderKVCacheStatus([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} }; class StatefulOVInferRequest : public OVInferRequest { @@ -146,7 +146,7 @@ class StatefulOVInferRequest : public OVInferRequest { void Infer() override; void RewindKVCache(size_t index) override; - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; + void SetReorderKVCacheStatus(const std::vector& src_indices, const std::vector& dst_indices) override; void FillTensor(const std::string& tensor_name, const ov::element::Type& type, const std::vector& shape, int32_t fill_value); void CacheTensor(const std::string& tensor_name, std::vector& cache); @@ -157,6 +157,7 @@ class StatefulOVInferRequest : public OVInferRequest { private: void PreProcessInferRequest(); void PostProcessInferRequest(); + void CleanupReorderStatus(); std::string target_device; std::vector cached_input_ids;