fix: recover from llama.cpp context overflow and reqwest SSE decode failures

Extend auto-compaction error detection to handle additional error patterns from llama.cpp backends: 'Context size has been exceeded', 'exceed_context_size_error', 'exceeds the available context size'. Also recover from reqwest 'error decoding response body' errors — some llama.cpp instances return a non-SSE plaintext HTTP 500 on context overflow, causing the SSE deserializer to fail. Add dynamic threshold adaptation: parse the server-reported context window size from error messages (e.g., '(81920 tokens)') and set the auto-compaction trigger at 70% of that value. This replaces the need for a hardcoded threshold, adapting automatically to any backend's limits. This patch was developed with assistance from OpenCode and a local Qwen 3.6 API server.
2026-06-05 03:56:45 +00:00 · 2026-05-27 16:45:17 +02:00
parent 87b7e74770
commit 1d516be779
2 changed files with 104 additions and 4 deletions
--- a/rust/crates/runtime/src/conversation.rs
+++ b/rust/crates/runtime/src/conversation.rs
@@ -204,6 +204,13 @@ where
        self
    }

+    /// Overwrite the auto-compaction input-token threshold on an existing value.
+    /// Takes `&mut self`, so the caller can retune it from runtime information
+    /// (e.g., a context window size parsed out of a server error message).
+    pub fn set_auto_compaction_input_tokens_threshold(&mut self, threshold: u32) {
+        self.auto_compaction_input_tokens_threshold = threshold;
+    }
+
    #[must_use]
    pub fn with_hook_abort_signal(mut self, hook_abort_signal: HookAbortSignal) -> Self {
        self.hook_abort_signal = hook_abort_signal;
--- a/rust/crates/rusty-claude-cli/src/main.rs
+++ b/rust/crates/rusty-claude-cli/src/main.rs
@@ -5659,11 +5659,36 @@ impl LiveCli {
                // Detect context window overflow. Some providers (e.g. OpenAI-compat backends)
                // return 400 with "no parseable body" instead of a proper context_length_exceeded
                // error when the request is too large to even parse — treat that as context overflow too.
+                // Also detect model-specific context error markers (e.g. llama.cpp returns
+                // "Context size has been exceeded." / "exceed_context_size_error" / "exceeds the available context size").
                let is_context_window = error_str.contains("context_window")
                    || error_str.contains("Context window")
-                    || error_str.contains("no parseable body");
+                    || error_str.contains("no parseable body")
+                    || error_str.contains("exceed_context_size")
+                    || error_str.contains("exceeds the available context size")
+                    || error_str.to_ascii_lowercase().contains("context size has been exceeded");
+
+                // Also treat "assistant stream produced no content" and reqwest decode failures
+                // as recoverable errors that may benefit from auto-compaction. Some backends (e.g.
+                // llama.cpp) return a non-SSE HTTP 500 body when context overflows, causing
+                // reqwest to fail with "error decoding response body" — treat that as context overflow too.
+                let is_no_content = error_str.contains("assistant stream produced no content")
+                    || error_str.contains("Failed to parse input at pos")
+                    || error_str.contains("error decoding response body");
+
+                if is_context_window || is_no_content {
+                    // If the error tells us the server's actual context window, adapt our
+                    // auto-compaction threshold so future auto-compact-trigger checks are accurate.
+                    if let Some(window) = extract_context_window_tokens_from_error(&error_str) {
+                        // Set threshold at 70% of the reported window to leave headroom.
+                        let threshold: u32 = (window as f64 * 0.7).round() as u32;
+                        println!(
+                            "  Server context window: {} tokens — setting auto-compaction threshold to {}",
+                            window, threshold
+                        );
+                        runtime.set_auto_compaction_input_tokens_threshold(threshold);
+                    }

-                if is_context_window {
                    // A single compaction pass may not free enough context space.
                    // Progressive retry: each round preserves fewer recent messages (4→2→1→0),
                    // trading conversation continuity for a smaller payload until it fits.
@@ -5745,9 +5770,21 @@ impl LiveCli {
                                let retry_str = retry_error.to_string();
                                let still_context_window = retry_str.contains("context_window")
                                    || retry_str.contains("Context window")
-                                    || retry_str.contains("no parseable body");
+                                    || retry_str.contains("no parseable body")
+                                    || retry_str.contains("exceed_context_size")
+                                    || retry_str.contains("exceeds the available context size")
+                                    || retry_str.to_ascii_lowercase().contains("context size has been exceeded");
+                                let still_no_content = retry_str.contains("assistant stream produced no content")
+                                    || retry_str.contains("Failed to parse input at pos")
+                                    || retry_str.contains("error decoding response body");
+
+                                if (still_context_window || still_no_content) && round + 1 < max_compact_rounds {
+                                    // If the retry error reveals the context window, adapt threshold.
+                                    if let Some(window) = extract_context_window_tokens_from_error(&retry_str) {
+                                        let threshold: u32 = (window as f64 * 0.7).round() as u32;
+                                        new_runtime.set_auto_compaction_input_tokens_threshold(threshold);
+                                    }

-                                if still_context_window && round + 1 < max_compact_rounds {
                                    // The compacted session was still too large for the model's context.
                                    // Shut down the old runtime, adopt the partially-compacted one,
                                    // and loop — the next round will compact more aggressively.
@@ -10054,6 +10091,62 @@ fn request_ends_with_tool_result(request: &ApiRequest) -> bool {
        .is_some_and(|message| message.role == MessageRole::Tool)
 }

+/// Extract a server-reported context window size (in tokens) from an error
+/// message. Scans `error_str` line by line; only lines mentioning "context
+/// size", "context length" or "context window" (ASCII case-insensitive) are
+/// examined. Returns the first candidate above 1000, or `None` if none parses.
+///
+/// Known formats:
+///   - "exceeds the available context size (81920 tokens)"
+///   - "context size (128000 tokens)"
+///   - "maximum context length is 200000 tokens"
+///   - "configured limit of NNNNNN tokens" (line must also name the context)
+fn extract_context_window_tokens_from_error(error_str: &str) -> Option<u32> {
+    // Only lines that mention a context-size marker are searched for a number.
+    for line in error_str.lines() {
+        let lowered = line.to_ascii_lowercase();
+        if lowered.contains("context size") || lowered.contains("context length")
+            || lowered.contains("context window")
+        {
+            // Parenthesised form, e.g. "(81920 tokens)". NOTE(review): only the first '(' is read.
+            if let Some(start) = lowered.find('(') {
+                if let Some(end) = lowered.find(")") {
+                    if start < end {
+                        let inner = &line[start + 1..end];
+                        let digits: String = inner.chars().take_while(|c| c.is_ascii_digit()).collect();
+                        if let Ok(n) = digits.parse::<u32>() {
+                            if n > 1000 {
+                                return Some(n);
+                            }
+                        }
+                    }
+                }
+            }
+            // "... is NNNNNN tokens". NOTE(review): first "is " wins, so "This ... is N" misses.
+            if let Some(pos) = lowered.find("is ") {
+                let rest = &line[pos + 3..];
+                let digits: String = rest.chars().take_while(|c| c.is_ascii_digit()).collect();
+                if let Ok(n) = digits.parse::<u32>() {
+                    if n > 1000 {
+                        return Some(n);
+                    }
+                }
+            }
+            // "... limit of NNNNNN tokens"; the same first-match caveat applies to "of ".
+            if let Some(pos) = lowered.find("of ") {
+                let rest = &line[pos + 3..];
+                let digits: String = rest.chars().take_while(|c| c.is_ascii_digit()).collect();
+                if let Ok(n) = digits.parse::<u32>() {
+                    if n > 1000 {
+                        return Some(n);
+                    }
+                }
+            }
+        }
+    }
+    None
+}
+
 fn format_user_visible_api_error(session_id: &str, error: &api::ApiError) -> String {
    if error.is_context_window_failure() {
        format_context_window_blocked_error(session_id, error)