Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/openvino/backend_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -795,9 +795,9 @@
}
}

void BackendManager::ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
void BackendManager::SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {

Check warning on line 798 in onnxruntime/core/providers/openvino/backend_manager.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/openvino/backend_manager.cc:798: Add #include <vector> for vector<> [build/include_what_you_use] [4]
if (concrete_backend_) {
concrete_backend_->ReorderKVCache(src_indices, dst_indices);
concrete_backend_->SetReorderKVCacheStatus(src_indices, dst_indices);
}
}

Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/backend_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class BackendManager {
void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data);
ov::CompiledModel GetOVCompiledModel();
void RewindKVCache(size_t index);
void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices);
void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices);

private:
std::unique_ptr<ONNX_NAMESPACE::ModelProto> GetModelProtoFromFusedNode(
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,9 @@
});
}

void BasicBackend::ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
void BasicBackend::SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {

Check warning on line 314 in onnxruntime/core/providers/openvino/backends/basic_backend.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/openvino/backends/basic_backend.cc:314: Add #include <vector> for vector<> [build/include_what_you_use] [4]
infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) {
infer_request->ReorderKVCache(src_indices, dst_indices);
infer_request->SetReorderKVCacheStatus(src_indices, dst_indices);
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class BasicBackend : public IBackend {
return exe_network_.Get();
}
void RewindKVCache(size_t index) override;
void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;

private:
bool ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/ibackend.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
virtual ov::CompiledModel GetOVCompiledModel() = 0;
virtual ~IBackend() = default;
virtual void RewindKVCache(size_t index) {}
virtual void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {}
virtual void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {}

Check warning on line 21 in onnxruntime/core/providers/openvino/ibackend.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/openvino/ibackend.h:21: Add #include <vector> for vector<> [build/include_what_you_use] [4]
};
using ptr_stream_t = std::unique_ptr<ModelBlobWrapper>;
class BackendFactory {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const ch

// Trigger KVCache Reorder for target Backend with vector arguments
for (auto& backend : backend_managers_) {
backend.ReorderKVCache(std::get<std::vector<int32_t>>(src_indices), std::get<std::vector<int32_t>>(dst_indices));
backend.SetReorderKVCacheStatus(std::get<std::vector<int32_t>>(src_indices), std::get<std::vector<int32_t>>(dst_indices));
}
} else {
// Handle unknown options
Expand Down
12 changes: 9 additions & 3 deletions onnxruntime/core/providers/openvino/ov_interface.cc
Original file line number Diff line number Diff line change
Expand Up @@ -573,23 +573,27 @@
}

// Post-processing step of the stateful infer request. Its only action is to
// call CleanupReorderStatus(), which clears the stored KV-cache reorder
// indices when is_kvcache_reorder_added is set.
// NOTE(review): the call site is not visible here; presumably this runs after
// each inference -- confirm against Infer().
void StatefulOVInferRequest::PostProcessInferRequest() {
CleanupReorderStatus();
}

// Drops the KV-cache reorder index lists (kv_src_indices / kv_dst_indices)
// held by this request. Does nothing unless is_kvcache_reorder_added is set.
void StatefulOVInferRequest::CleanupReorderStatus() {
  if (!is_kvcache_reorder_added) {
    return;
  }
  // The two lists are independent; emptying both leaves no reorder request
  // stored on this object.
  kv_dst_indices.clear();
  kv_src_indices.clear();
}

void StatefulOVInferRequest::ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {
void StatefulOVInferRequest::SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) {

Check warning on line 586 in onnxruntime/core/providers/openvino/ov_interface.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/openvino/ov_interface.cc:586: Add #include <vector> for vector<> [build/include_what_you_use] [4]
// Validate input parameters
if (src_indices.size() != dst_indices.size()) {
ORT_THROW(log_tag +
"ReorderKVCache: src_indices and dst_indices must have the same size. "
"SetReorderKVCacheStatus: src_indices and dst_indices must have the same size. "
"Got src_indices.size()=" +
std::to_string(src_indices.size()) +
", dst_indices.size()=" + std::to_string(dst_indices.size()));
}

LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with "
LOGS_DEFAULT(INFO) << log_tag << "SetReorderKVCacheStatus: Reordering OpenVINO-internal KVCache state with "
<< src_indices.size() << " index pairs";

kv_src_indices = src_indices;
Expand All @@ -615,7 +619,9 @@
if (index == 0) {
// In this case, since we're resetting the entire KVCache, simply reset the state.
ovInfReq.reset_state();
CleanupReorderStatus();
} else {
// TODO for is_kvcache_reorder_added: do inference once to make sure the KV cache state is updated with the latest generated token before we trim the KV cache

Check warning on line 624 in onnxruntime/core/providers/openvino/ov_interface.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2] Raw Output: onnxruntime/core/providers/openvino/ov_interface.cc:624: Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2]
// Retrieve KVCache states and trim them to the specified index.
// The following logic is adapted from:
// https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/core/providers/openvino/ov_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class OVInferRequest {
return ovInfReq;
}
virtual void RewindKVCache([[maybe_unused]] size_t index) {}
virtual void ReorderKVCache([[maybe_unused]] const std::vector<int32_t>& src_indices, [[maybe_unused]] const std::vector<int32_t>& dst_indices) {}
virtual void SetReorderKVCacheStatus([[maybe_unused]] const std::vector<int32_t>& src_indices, [[maybe_unused]] const std::vector<int32_t>& dst_indices) {}
};

class StatefulOVInferRequest : public OVInferRequest {
Expand All @@ -146,7 +146,7 @@ class StatefulOVInferRequest : public OVInferRequest {

void Infer() override;
void RewindKVCache(size_t index) override;
void ReorderKVCache(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
void SetReorderKVCacheStatus(const std::vector<int32_t>& src_indices, const std::vector<int32_t>& dst_indices) override;
void FillTensor(const std::string& tensor_name, const ov::element::Type& type,
const std::vector<size_t>& shape, int32_t fill_value);
void CacheTensor(const std::string& tensor_name, std::vector<int64_t>& cache);
Expand All @@ -157,6 +157,7 @@ class StatefulOVInferRequest : public OVInferRequest {
private:
void PreProcessInferRequest();
void PostProcessInferRequest();
void CleanupReorderStatus();
std::string target_device;

std::vector<int64_t> cached_input_ids;
Expand Down
Loading