intel · Kotomi-Du · Mar 5, 2026 · Mar 5, 2026 · Mar 18, 2026
diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -795,9 +795,9 @@
   }
 }
 
-void BackendManager::ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
+void BackendManager::SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
   if (concrete_backend_) {
-    concrete_backend_->ReorderKVCache(src_indices, dst_indices);
+    concrete_backend_->SetReorderKVCacheStatus(src_indices, dst_indices);
   }
 }
 

diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h
@@ -31,7 +31,7 @@ class BackendManager {
   void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data);
   ov::CompiledModel GetOVCompiledModel();
   void RewindKVCache(size_t index);
-  void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices);
+  void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices);
 
  private:
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> GetModelProtoFromFusedNode(

diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -311,9 +311,9 @@
   });
 }
 
-void BasicBackend::ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
+void BasicBackend::SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
   infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) {
-    infer_request->ReorderKVCache(src_indices, dst_indices);
+    infer_request->SetReorderKVCacheStatus(src_indices, dst_indices);
   });
 }
 

diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -151,7 +151,7 @@ class BasicBackend : public IBackend {
     return exe_network_.Get();
   }
   void RewindKVCache(size_t index) override;
-  void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
+  void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
 
  private:
   bool ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);

diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h
@@ -18,7 +18,7 @@
   virtual ov::CompiledModel GetOVCompiledModel() = 0;
   virtual ~IBackend() = default;
   virtual void RewindKVCache(size_t index) {}
-  virtual void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {}
+  virtual void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {}
 };
 using ptr_stream_t = std::unique_ptr<ModelBlobWrapper>;
 class BackendFactory {

diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -343,7 +343,7 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const ch
 
       // Trigger KVCache Reorder for target Backend with vector arguments
       for (auto& backend : backend_managers_) {
-        backend.ReorderKVCache(std::get<std::vector<int32_t>>(src_indices), std::get<std::vector<int32_t>>(dst_indices));
+        backend.SetReorderKVCacheStatus(std::get<std::vector<int32_t>>(src_indices), std::get<std::vector<int32_t>>(dst_indices));
       }
     } else {
       // Handle unknown options

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -573,23 +573,27 @@
 }
 
 void StatefulOVInferRequest::PostProcessInferRequest() {
+  CleanupReorderStatus();
+}
+
+void StatefulOVInferRequest::CleanupReorderStatus() {
   if (is_kvcache_reorder_added) {
     kv_src_indices.clear();
     kv_dst_indices.clear();
   }
 }
 
-void StatefulOVInferRequest::ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
+void StatefulOVInferRequest::SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
   // Validate input parameters
   if (src_indices.size() != dst_indices.size()) {
     ORT_THROW(log_tag +
-              "ReorderKVCache: src_indices and dst_indices must have the same size. "
+              "SetReorderKVCacheStatus: src_indices and dst_indices must have the same size. "
               "Got src_indices.size()=" +
               std::to_string(src_indices.size()) +
               ", dst_indices.size()=" + std::to_string(dst_indices.size()));
   }
 
-  LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with "
+  LOGS_DEFAULT(INFO) << log_tag << "SetReorderKVCacheStatus: Reordering OpenVINO-internal KVCache state with "
                      << src_indices.size() << " index pairs";
 
   kv_src_indices = src_indices;
@@ -615,7 +619,9 @@
     if (index == 0) {
       // In this case, since we're resetting the entire KVCache, simply reset the state.
       ovInfReq.reset_state();
+      CleanupReorderStatus();
     } else {
+      // TODO for is_kvcache_reorder_added: do inference once to make sure the KV cache state is updated with the latest generated token before we trim the KV cache
       // Retrieve KVCache states and trim them to the specified index.
       // The following logic is adapted from:
       // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329

diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -137,7 +137,7 @@ class OVInferRequest {
     return ovInfReq;
   }
   virtual void RewindKVCache([[maybe_unused]] size_t index) {}
-  virtual void ReorderKVCache([[maybe_unused]] const std::vector<int32_t>& src_indices, [[maybe_unused]] const std::vector<int32_t>& dst_indices) {}
+  virtual void SetReorderKVCacheStatus([[maybe_unused]] const std::vector<int32_t>& src_indices, [[maybe_unused]] const std::vector<int32_t>& dst_indices) {}
 };
 
 class StatefulOVInferRequest : public OVInferRequest {
@@ -146,7 +146,7 @@ class StatefulOVInferRequest : public OVInferRequest {
 
   void Infer() override;
   void RewindKVCache(size_t index) override;
-  void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
+  void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
   void FillTensor(const std::string& tensor_name, const ov::element::Type& type,
                   const std::vector<size_t>& shape, int32_t fill_value);
   void CacheTensor(const std::string& tensor_name, std::vector<int64_t>& cache);
@@ -157,6 +157,7 @@ class StatefulOVInferRequest : public OVInferRequest {
  private:
   void PreProcessInferRequest();
   void PostProcessInferRequest();
+  void CleanupReorderStatus();
   std::string target_device;
 
   std::vector<int64_t> cached_input_ids;