diff --git a/rust/crates/api/src/providers/openai_compat.rs b/rust/crates/api/src/providers/openai_compat.rs
index f66710ab..ca028845 100644
--- a/rust/crates/api/src/providers/openai_compat.rs
+++ b/rust/crates/api/src/providers/openai_compat.rs
@@ -493,12 +493,7 @@ impl StreamState {
         }
 
         if let Some(usage) = chunk.usage {
-            self.usage = Some(Usage {
-                input_tokens: usage.prompt_tokens,
-                cache_creation_input_tokens: 0,
-                cache_read_input_tokens: 0,
-                output_tokens: usage.completion_tokens,
-            });
+            self.usage = Some(usage.normalized());
         }
 
         for choice in chunk.choices {
@@ -776,6 +771,31 @@ struct OpenAiUsage {
     prompt_tokens: u32,
     #[serde(default)]
     completion_tokens: u32,
+    #[serde(default)]
+    prompt_tokens_details: Option<OpenAiPromptTokensDetails>,
+}
+
+#[derive(Debug, Deserialize)]
+struct OpenAiPromptTokensDetails {
+    #[serde(default)]
+    cached_tokens: u32,
+}
+
+impl OpenAiUsage {
+    /// OpenAI's `prompt_tokens` includes cached tokens, so report the cached
+    /// portion as cache reads and subtract it from `input_tokens`.
+    fn normalized(&self) -> Usage {
+        let cached_tokens = self
+            .prompt_tokens_details
+            .as_ref()
+            .map_or(0, |details| details.cached_tokens);
+        Usage {
+            input_tokens: self.prompt_tokens.saturating_sub(cached_tokens),
+            cache_creation_input_tokens: 0,
+            cache_read_input_tokens: cached_tokens,
+            output_tokens: self.completion_tokens,
+        }
+    }
 }
 
 #[derive(Debug, Deserialize)]
@@ -1377,18 +1397,10 @@ fn normalize_response(
             .finish_reason
             .map(|value| normalize_finish_reason(&value)),
         stop_sequence: None,
-        usage: Usage {
-            input_tokens: response
-                .usage
-                .as_ref()
-                .map_or(0, |usage| usage.prompt_tokens),
-            cache_creation_input_tokens: 0,
-            cache_read_input_tokens: 0,
-            output_tokens: response
-                .usage
-                .as_ref()
-                .map_or(0, |usage| usage.completion_tokens),
-        },
+        usage: response
+            .usage
+            .as_ref()
+            .map_or_else(Usage::default, OpenAiUsage::normalized),
         request_id: None,
     })
 }
diff --git a/rust/crates/api/tests/openai_compat_integration.rs b/rust/crates/api/tests/openai_compat_integration.rs
index 5db9eaf1..0cbe8732 100644
--- a/rust/crates/api/tests/openai_compat_integration.rs
+++ b/rust/crates/api/tests/openai_compat_integration.rs
@@ -42,6 +42,9 @@ async fn send_message_uses_openai_compatible_endpoint_and_auth() {
         .expect("request should succeed");
 
     assert_eq!(response.model, "grok-3");
+    assert_eq!(response.usage.input_tokens, 8);
+    assert_eq!(response.usage.cache_read_input_tokens, 3);
+    assert_eq!(response.usage.output_tokens, 5);
     assert_eq!(response.total_tokens(), 16);
     assert_eq!(
         response.content,
@@ -284,7 +287,7 @@ async fn openai_streaming_requests_opt_into_usage_chunks() {
     let sse = concat!(
         "data: {\"id\":\"chatcmpl_openai_stream\",\"model\":\"gpt-5\",\"choices\":[{\"delta\":{\"content\":\"Hi\"}}]}\n\n",
         "data: {\"id\":\"chatcmpl_openai_stream\",\"choices\":[{\"delta\":{},\"finish_reason\":\"stop\"}]}\n\n",
-        "data: {\"id\":\"chatcmpl_openai_stream\",\"choices\":[],\"usage\":{\"prompt_tokens\":9,\"completion_tokens\":4}}\n\n",
+        "data: {\"id\":\"chatcmpl_openai_stream\",\"choices\":[],\"usage\":{\"prompt_tokens\":9,\"completion_tokens\":4,\"prompt_tokens_details\":{\"cached_tokens\":2}}}\n\n",
         "data: [DONE]\n\n"
     );
     let server = spawn_server(
@@ -339,8 +342,10 @@ async fn openai_streaming_requests_opt_into_usage_chunks() {
 
     match &events[4] {
         StreamEvent::MessageDelta(MessageDeltaEvent { usage, .. }) => {
-            assert_eq!(usage.input_tokens, 9);
+            assert_eq!(usage.input_tokens, 7);
+            assert_eq!(usage.cache_read_input_tokens, 2);
             assert_eq!(usage.output_tokens, 4);
+            assert_eq!(usage.total_tokens(), 13);
         }
         other => panic!("expected message delta, got {other:?}"),
     }