From 44217f7b2acca91ff933cd0683e82427dd6a78de Mon Sep 17 00:00:00 2001
From: licorn
Date: Tue, 24 Mar 2026 21:55:51 +0100
Subject: [PATCH 1/2] fix(router): use max_completion_tokens for OpenAI GPT-5+
 validation probe

OpenAI GPT-5 models reject the legacy max_tokens parameter and require
max_completion_tokens. The inference validation probe now sends
max_completion_tokens as the primary parameter, with an automatic
fallback to max_tokens when the backend returns HTTP 400 (for
legacy/self-hosted backends that only support the older parameter).

Closes #517

Signed-off-by: Maxime Grenu
---
 crates/openshell-router/src/backend.rs   | 158 ++++++++++++++++++++++-
 crates/openshell-server/src/inference.rs |   2 +-
 2 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs
index 0708cfb0..d07e43ae 100644
--- a/crates/openshell-router/src/backend.rs
+++ b/crates/openshell-router/src/backend.rs
@@ -31,6 +31,11 @@ struct ValidationProbe {
     path: &'static str,
     protocol: &'static str,
     body: bytes::Bytes,
+    /// Alternate body to try when the primary probe fails with HTTP 400.
+    /// Used for OpenAI chat completions where newer models require
+    /// `max_completion_tokens` while legacy/self-hosted backends only
+    /// accept `max_tokens`.
+    fallback_body: Option<bytes::Bytes>,
 }
 
 /// Response from a proxied HTTP request to a backend (fully buffered).
@@ -163,12 +168,17 @@ fn validation_probe(route: &ResolvedRoute) -> Result<ValidationProbe, ValidationFailure> {
@@ -240,14 +250,57 @@ pub async fn verify_backend_endpoint(
+    body: bytes::Bytes,
+) -> Result<ValidatedEndpoint, ValidationFailure> {
+    let response = send_backend_request(client, route, "POST", path, headers, body)
         .await
         .map_err(|err| match err {
             RouterError::UpstreamUnavailable(details) => ValidationFailure {
                 details,
             },
         })?;
-    let url = build_backend_url(&route.endpoint, probe.path);
+    let url = build_backend_url(&route.endpoint, path);
 
     if response.status().is_success() {
         return Ok(ValidatedEndpoint {
             url,
-            protocol: probe.protocol.to_string(),
+            protocol: protocol.to_string(),
         });
     }
@@ -376,7 +429,7 @@ fn build_backend_url(endpoint: &str, path: &str) -> String {
 
 #[cfg(test)]
 mod tests {
-    use super::{build_backend_url, verify_backend_endpoint};
+    use super::{build_backend_url, verify_backend_endpoint, ValidationFailureKind};
     use crate::config::ResolvedRoute;
     use openshell_core::inference::AuthHeader;
     use wiremock::matchers::{body_partial_json, header, method, path};
@@ -463,4 +516,99 @@ mod tests {
         assert_eq!(validated.protocol, "openai_chat_completions");
         assert_eq!(validated.url, "mock://test-backend/v1/chat/completions");
     }
+
+    /// GPT-5+ models reject `max_tokens` — the primary probe uses
+    /// `max_completion_tokens` so validation should succeed directly.
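+    /// (`body_partial_json` matches a subset of the request body, so the
+    /// probe body may include additional fields beyond the token limit
+    /// pinned by this matcher.)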
+    #[tokio::test]
+    async fn verify_openai_chat_uses_max_completion_tokens() {
+        let mock_server = MockServer::start().await;
+        let route = test_route(
+            &mock_server.uri(),
+            &["openai_chat_completions"],
+            AuthHeader::Bearer,
+        );
+
+        Mock::given(method("POST"))
+            .and(path("/v1/chat/completions"))
+            .and(body_partial_json(serde_json::json!({
+                "max_completion_tokens": 32,
+            })))
+            .respond_with(
+                ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "chatcmpl-1"})),
+            )
+            .mount(&mock_server)
+            .await;
+
+        let client = reqwest::Client::builder().build().unwrap();
+        let validated = verify_backend_endpoint(&client, &route).await.unwrap();
+
+        assert_eq!(validated.protocol, "openai_chat_completions");
+    }
+
+    /// Legacy/self-hosted backends that reject `max_completion_tokens`
+    /// should succeed on the fallback probe using `max_tokens`.
+    #[tokio::test]
+    async fn verify_openai_chat_falls_back_to_max_tokens() {
+        let mock_server = MockServer::start().await;
+        let route = test_route(
+            &mock_server.uri(),
+            &["openai_chat_completions"],
+            AuthHeader::Bearer,
+        );
+
+        // Reject the primary probe (max_completion_tokens) with 400.
+        Mock::given(method("POST"))
+            .and(path("/v1/chat/completions"))
+            .and(body_partial_json(serde_json::json!({
+                "max_completion_tokens": 32,
+            })))
+            .respond_with(ResponseTemplate::new(400).set_body_string(
+                r#"{"error":{"message":"Unsupported parameter: 'max_completion_tokens'"}}"#,
+            ))
+            .expect(1)
+            .mount(&mock_server)
+            .await;
+
+        // Accept the fallback probe (max_tokens).
+        Mock::given(method("POST"))
+            .and(path("/v1/chat/completions"))
+            .and(body_partial_json(serde_json::json!({
+                "max_tokens": 32,
+            })))
+            .respond_with(
+                ResponseTemplate::new(200).set_body_json(serde_json::json!({"id": "chatcmpl-2"})),
+            )
+            .expect(1)
+            .mount(&mock_server)
+            .await;
+
+        let client = reqwest::Client::builder().build().unwrap();
+        let validated = verify_backend_endpoint(&client, &route).await.unwrap();
+
+        assert_eq!(validated.protocol, "openai_chat_completions");
+    }
+
+    /// Non-chat-completions probes (e.g. anthropic_messages) should not
+    /// have a fallback — a 400 remains a hard failure.
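+    /// (A 400 on a protocol without a fallback body is expected to map
+    /// straight to `ValidationFailureKind::RequestShape`, which the final
+    /// assertion checks.)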
+    #[tokio::test]
+    async fn verify_non_chat_completions_no_fallback() {
+        let mock_server = MockServer::start().await;
+        let route = test_route(
+            &mock_server.uri(),
+            &["anthropic_messages"],
+            AuthHeader::Custom("x-api-key"),
+        );
+
+        Mock::given(method("POST"))
+            .and(path("/v1/messages"))
+            .respond_with(ResponseTemplate::new(400).set_body_string("bad request"))
+            .mount(&mock_server)
+            .await;
+
+        let client = reqwest::Client::builder().build().unwrap();
+        let result = verify_backend_endpoint(&client, &route).await;
+
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err().kind, ValidationFailureKind::RequestShape);
+    }
 }
diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs
index 78b95944..bbabaf70 100644
--- a/crates/openshell-server/src/inference.rs
+++ b/crates/openshell-server/src/inference.rs
@@ -854,7 +854,7 @@ mod tests {
             .and(header("content-type", "application/json"))
             .and(body_partial_json(serde_json::json!({
                 "model": "gpt-4o-mini",
-                "max_tokens": 32,
+                "max_completion_tokens": 32,
             })))
             .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                 "id": "chatcmpl-123",

From d4f81c4bf687ee3ec2de605366a4648058b0e34f Mon Sep 17 00:00:00 2001
From: John Myers
Date: Wed, 25 Mar 2026 16:01:59 -0700
Subject: [PATCH 2/2] style(router): fix cargo fmt import order and line
 length

---
 crates/openshell-router/src/backend.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs
index d07e43ae..d82ea082 100644
--- a/crates/openshell-router/src/backend.rs
+++ b/crates/openshell-router/src/backend.rs
@@ -429,7 +429,7 @@ fn build_backend_url(endpoint: &str, path: &str) -> String {
 
 #[cfg(test)]
 mod tests {
-    use super::{build_backend_url, verify_backend_endpoint, ValidationFailureKind};
+    use super::{ValidationFailureKind, build_backend_url, verify_backend_endpoint};
     use crate::config::ResolvedRoute;
     use openshell_core::inference::AuthHeader;
     use wiremock::matchers::{body_partial_json, header, method, path};
@@ -609,6 +609,9 @@ mod tests {
         let result = verify_backend_endpoint(&client, &route).await;
 
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().kind, ValidationFailureKind::RequestShape);
+        assert_eq!(
+            result.unwrap_err().kind,
+            ValidationFailureKind::RequestShape
+        );
     }
 }