feat: auto-compact and retry on context window errors

Adds automatic compaction and retry when the context window is exceeded.
This commit is contained in:
TheArchitectit
2026-05-24 21:24:44 -05:00
committed by GitHub
parent f1a55a211e
commit b43a6f2d29
4 changed files with 917 additions and 5 deletions

View File

@@ -3843,12 +3843,13 @@ fn run_resume_command(
json: Some(serde_json::json!({ "kind": "help", "text": render_repl_help() })),
}),
SlashCommand::Compact => {
let result = runtime::compact_session(
let result = runtime::trident::trident_compact_session(
session,
CompactionConfig {
max_estimated_tokens: 0,
..CompactionConfig::default()
},
&runtime::trident::TridentConfig::default(),
);
let removed = result.removed_message_count;
let kept = result.compacted_session.messages.len();
@@ -5046,6 +5047,120 @@ impl LiveCli {
TerminalRenderer::new().color_theme(),
&mut stdout,
)?;
// ============================================================================
// Auto-compact retry on context window errors
// ============================================================================
// When the model API returns a context_window_blocked error (because the request
// exceeds the model's context window), we automatically:
// 1. Compact the session (remove old messages to free up space)
// 2. Retry the original request with the compacted session
// 3. Report results to the user
//
// This eliminates the need for users to manually run /compact when they
// hit context limits - the recovery happens automatically.
//
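// Detection: We look for "context_window", "Context window", or
// "no parseable body" in the error message, which covers error types like:
// - "context_window_blocked"
// - "Context window blocked"
// - a 400 response reported as "no parseable body" (request too large to parse)
// NOTE: messages such as "This model's maximum context length is X tokens..."
// contain none of these substrings and are NOT matched by this check.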
// ============================================================================
let error_str = error.to_string();
// Detect context window overflow. Some providers (e.g. OpenAI-compat backends)
// return 400 with "no parseable body" instead of a proper context_length_exceeded
// error when the request is too large to even parse — treat that as context overflow too.
let is_context_window = error_str.contains("context_window")
|| error_str.contains("Context window")
|| error_str.contains("no parseable body");
if is_context_window {
// A single compaction pass may not free enough context space.
// Progressive retry: each round preserves fewer recent messages (4→2→1→0),
// trading conversation continuity for a smaller payload until it fits.
// Max 4 rounds before giving up and surfacing the error to the user.
let max_compact_rounds = 4;
let preserve_schedule = [4, 2, 1, 0];
for round in 0..max_compact_rounds {
let preserve = preserve_schedule[round];
println!(
" Auto-compacting session (round {}/{}, preserving {} recent messages)...",
round + 1,
max_compact_rounds,
preserve
);
// Run Trident pipeline then summary-based compaction
let result = runtime::trident::trident_compact_session(
runtime.session(),
CompactionConfig {
preserve_recent_messages: preserve,
max_estimated_tokens: 0,
},
&runtime::trident::TridentConfig::default(),
);
let removed = result.removed_message_count;
if removed == 0 && round > 0 {
// No more messages to compact — further rounds won't help
println!(" No further compaction possible.");
break;
}
if removed > 0 {
println!("{}", format_compact_report(removed, result.compacted_session.messages.len(), false));
}
// Without this assignment, prepare_turn_runtime() would read from
// self.runtime.session(), which still holds the ORIGINAL un-compacted session,
// so every retry round would send the same bloated request and the compaction
// would be wasted.
*self.runtime.session_mut() = result.compacted_session.clone();
// Build a new runtime with the compacted session and retry
let (mut new_runtime, hook_abort_monitor) = self.prepare_turn_runtime(true)?;
drop(hook_abort_monitor);
let mut rp = CliPermissionPrompter::new(self.permission_mode);
match new_runtime.run_turn(input, Some(&mut rp)) {
Ok(summary) => {
self.replace_runtime(new_runtime)?;
spinner.finish(
if round == 0 { "✨ Done (after auto-compact)" } else { "✨ Done (after aggressive auto-compact)" },
TerminalRenderer::new().color_theme(),
&mut stdout,
)?;
println!();
if let Some(event) = summary.auto_compaction {
println!("{}", format_auto_compaction_notice(event.removed_message_count));
}
self.persist_session()?;
return Ok(());
}
Err(retry_error) => {
let retry_str = retry_error.to_string();
let still_context_window = retry_str.contains("context_window")
|| retry_str.contains("Context window")
|| retry_str.contains("no parseable body");
if still_context_window && round + 1 < max_compact_rounds {
// The compacted session was still too large for the model's context.
// Shut down the old runtime, adopt the partially-compacted one,
// and loop — the next round will compact more aggressively.
runtime.shutdown_plugins()?;
runtime = new_runtime;
continue;
}
// Not a context window error, or out of rounds
return Err(Box::new(retry_error));
}
}
}
}
// If not a context window error, return original error
Err(Box::new(error))
}
}