feat(llm): add resilience layer with rate limiting and retry

Add LLMSemaphore for concurrency control (default 4) and retryWithBackoff
with exponential backoff respecting 429 retryAfterSeconds. Wrap all
LLMGateway calls (chatForRole, chatDirect, embedForRole) via withResilience.

New config fields: LLM_MAX_CONCURRENT_CALLS, LLM_RETRY_MAX_ATTEMPTS,
LLM_RETRY_BASE_DELAY_MS, ENABLE_TRIAGE.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)
Authored by jeffusion on 2026-03-05 22:02:03 +08:00; committed by 路遥知码力
parent 9a356a228f, commit 839d4a89bf
5 changed files with 859 additions and 9 deletions
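For orientation, a minimal sketch of how the new settings feed the gateway. The instantiation site is not part of this diff, so the import paths and the config accessor below are assumptions for illustration; the AppConfig field names and the LLMGateway constructor signature come from the changes in this commit.

import { config } from './config';            // hypothetical accessor exposing AppConfig
import { LLMGateway } from './llm/gateway';   // path assumed for illustration

const gateway = new LLMGateway(config.llmMaxConcurrentCalls, {
  maxAttempts: config.llmRetryMaxAttempts,
  baseDelayMs: config.llmRetryBaseDelayMs,
});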


@@ -37,6 +37,10 @@ export interface AppConfig {
enableHumanGate: boolean;
allowedCommands: string[];
commandTimeoutMs: number;
llmMaxConcurrentCalls: number;
llmRetryMaxAttempts: number;
llmRetryBaseDelayMs: number;
enableTriage: boolean;
qdrantUrl: string | undefined;
enableMemory: boolean;
fewShotExamplesCount: number;
@@ -153,6 +157,10 @@ class ConfigManager {
'wc',
]),
commandTimeoutMs: toNumber('REVIEW_COMMAND_TIMEOUT_MS', 10000),
llmMaxConcurrentCalls: toNumber('LLM_MAX_CONCURRENT_CALLS', 4),
llmRetryMaxAttempts: toNumber('LLM_RETRY_MAX_ATTEMPTS', 3),
llmRetryBaseDelayMs: toNumber('LLM_RETRY_BASE_DELAY_MS', 1000),
enableTriage: toBoolean('ENABLE_TRIAGE', true),
qdrantUrl: values.QDRANT_URL,
enableMemory: toBoolean('ENABLE_MEMORY', false),
fewShotExamplesCount: toNumber('FEW_SHOT_EXAMPLES_COUNT', 10),


@@ -266,6 +266,48 @@ export const CONFIG_FIELDS: ConfigFieldMeta[] = [
max: 300000,
defaultValue: 10000,
},
{
envKey: 'LLM_MAX_CONCURRENT_CALLS',
group: 'review',
label: 'LLM 最大并发调用',
description: '同时在飞的 LLM API 调用上限(防止触发 Provider 并发/RPM 限制)',
type: 'number',
sensitive: false,
min: 1,
max: 20,
defaultValue: 4,
},
{
envKey: 'LLM_RETRY_MAX_ATTEMPTS',
group: 'review',
label: 'LLM 重试次数',
description: 'LLM 调用失败后的最大重试次数(仅对 429/网络错误生效)',
type: 'number',
sensitive: false,
min: 1,
max: 10,
defaultValue: 3,
},
{
envKey: 'LLM_RETRY_BASE_DELAY_MS',
group: 'review',
label: 'LLM 重试基础延迟(ms)',
description: '重试时的基础延迟(指数退避,实际延迟 = baseDelay × 2^(attempt-1))',
type: 'number',
sensitive: false,
min: 100,
max: 30000,
defaultValue: 1000,
},
{
envKey: 'ENABLE_TRIAGE',
group: 'review',
label: '启用变更分流',
description: '是否启用 Triage 分流(用 Planner 模型先评估变更复杂度,再按需派发 Specialist)',
type: 'boolean',
sensitive: false,
defaultValue: true,
},
// ── 记忆与学习 ──────────────────────────────────────────────────────────
{


@@ -0,0 +1,586 @@
import { describe, expect, test } from 'bun:test';
import { LLMAuthError, LLMConnectionError, LLMRateLimitError } from '../errors';
import { LLMSemaphore, retryWithBackoff, withResilience } from '../resilience';
describe('LLMSemaphore', () => {
test('constructor throws for maxConcurrent < 1', () => {
try {
new LLMSemaphore(0);
expect(true).toBe(false);
} catch (e: any) {
expect(e.message).toContain('maxConcurrent must be >= 1');
}
});
test('constructor throws for negative maxConcurrent', () => {
try {
new LLMSemaphore(-5);
expect(true).toBe(false);
} catch (e: any) {
expect(e.message).toContain('maxConcurrent must be >= 1');
}
});
test('constructor accepts valid maxConcurrent', () => {
const sem = new LLMSemaphore(5);
expect(sem.activeCount).toBe(0);
expect(sem.pendingCount).toBe(0);
});
test('acquire() resolves immediately when under limit', async () => {
const sem = new LLMSemaphore(2);
await sem.acquire();
expect(sem.activeCount).toBe(1);
expect(sem.pendingCount).toBe(0);
});
test('acquire() resolves multiple times when under limit', async () => {
const sem = new LLMSemaphore(3);
await sem.acquire();
await sem.acquire();
expect(sem.activeCount).toBe(2);
expect(sem.pendingCount).toBe(0);
});
test('acquire() blocks when at limit', async () => {
const sem = new LLMSemaphore(1);
await sem.acquire();
expect(sem.activeCount).toBe(1);
let secondAcquireResolved = false;
sem.acquire().then(() => {
secondAcquireResolved = true;
});
await new Promise((resolve) => setTimeout(resolve, 10));
expect(secondAcquireResolved).toBe(false);
expect(sem.pendingCount).toBe(1);
});
test('release() allows blocked acquire() to resume', async () => {
const sem = new LLMSemaphore(1);
await sem.acquire();
let secondAcquireResolved = false;
const secondPromise = sem.acquire().then(() => {
secondAcquireResolved = true;
});
await new Promise((resolve) => setTimeout(resolve, 10));
expect(secondAcquireResolved).toBe(false);
sem.release();
await secondPromise;
expect(secondAcquireResolved).toBe(true);
expect(sem.activeCount).toBe(1);
});
test('activeCount reflects correct state after acquire/release cycle', async () => {
const sem = new LLMSemaphore(2);
expect(sem.activeCount).toBe(0);
await sem.acquire();
expect(sem.activeCount).toBe(1);
await sem.acquire();
expect(sem.activeCount).toBe(2);
sem.release();
expect(sem.activeCount).toBe(1);
sem.release();
expect(sem.activeCount).toBe(0);
});
test('pendingCount reflects queued requests', async () => {
const sem = new LLMSemaphore(1);
await sem.acquire();
expect(sem.pendingCount).toBe(0);
sem.acquire();
sem.acquire();
sem.acquire();
await new Promise((resolve) => setTimeout(resolve, 10));
expect(sem.pendingCount).toBe(3);
});
test('multiple concurrent acquires with interleaved releases', async () => {
const sem = new LLMSemaphore(2);
const sequence: string[] = [];
await sem.acquire();
sequence.push('acquire1');
await sem.acquire();
sequence.push('acquire2');
expect(sem.activeCount).toBe(2);
expect(sem.pendingCount).toBe(0);
const p3 = sem.acquire().then(() => sequence.push('acquire3'));
const p4 = sem.acquire().then(() => sequence.push('acquire4'));
const p5 = sem.acquire().then(() => sequence.push('acquire5'));
await new Promise((resolve) => setTimeout(resolve, 10));
expect(sem.pendingCount).toBe(3);
sem.release();
expect(sem.activeCount).toBe(2);
await new Promise((resolve) => setTimeout(resolve, 10));
expect(sequence).toContain('acquire3');
sem.release();
expect(sem.activeCount).toBe(2);
await new Promise((resolve) => setTimeout(resolve, 10));
expect(sequence).toContain('acquire4');
sem.release();
await p5;
expect(sequence).toContain('acquire5');
});
});
describe('retryWithBackoff', () => {
test('succeeds on first attempt (no retry)', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
return 'success';
};
const result = await retryWithBackoff(fn, { maxAttempts: 3 });
expect(result).toBe('success');
expect(callCount).toBe(1);
});
test('retries on LLMRateLimitError, succeeds on 2nd attempt', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
throw new LLMRateLimitError('openai', 1);
}
return 'success';
};
const result = await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 10,
maxDelayMs: 100,
jitter: false,
});
expect(result).toBe('success');
expect(callCount).toBe(2);
});
test('retries on LLMConnectionError, succeeds on 2nd attempt', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
throw new LLMConnectionError('openai', 'network timeout');
}
return 'success';
};
const result = await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 10,
maxDelayMs: 100,
jitter: false,
});
expect(result).toBe('success');
expect(callCount).toBe(2);
});
test('throws immediately on LLMAuthError (non-retryable)', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
throw new LLMAuthError('openai', 'invalid api key');
};
try {
await retryWithBackoff(fn, { maxAttempts: 3 });
expect(true).toBe(false);
} catch (e: any) {
expect(e.name).toBe('LLMAuthError');
expect(callCount).toBe(1);
}
});
test('throws immediately on generic Error (non-retryable)', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
throw new Error('some other error');
};
try {
await retryWithBackoff(fn, { maxAttempts: 3 });
expect(true).toBe(false);
} catch (e: any) {
expect(e.message).toBe('some other error');
expect(callCount).toBe(1);
}
});
test('exhausts maxAttempts and throws last error', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
throw new LLMRateLimitError('openai', undefined, 'rate limited');
};
try {
await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 10,
maxDelayMs: 100,
jitter: false,
});
expect(true).toBe(false);
} catch (e: any) {
expect(e.name).toBe('LLMRateLimitError');
expect(callCount).toBe(3);
}
});
test('respects retryAfterSeconds from LLMRateLimitError', async () => {
let callCount = 0;
let delayStartTime = 0;
let delayEndTime = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
delayStartTime = Date.now();
throw new LLMRateLimitError('openai', 0.1);
}
delayEndTime = Date.now();
return 'success';
};
const result = await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 10,
maxDelayMs: 100,
jitter: false,
});
expect(result).toBe('success');
expect(callCount).toBe(2);
const actualDelay = delayEndTime - delayStartTime;
expect(actualDelay).toBeGreaterThanOrEqual(50);
expect(actualDelay).toBeLessThan(300);
});
test('exponential backoff without jitter: attempt 1 delay ≈ baseDelayMs', async () => {
let callCount = 0;
let delayStartTime = 0;
let delayEndTime = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
delayStartTime = Date.now();
throw new LLMConnectionError('openai');
}
delayEndTime = Date.now();
return 'success';
};
await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 20,
maxDelayMs: 1000,
jitter: false,
});
const actualDelay = delayEndTime - delayStartTime;
expect(actualDelay).toBeGreaterThanOrEqual(10);
expect(actualDelay).toBeLessThan(100);
});
test('exponential backoff attempt 2: baseDelayMs * 2', async () => {
let callCount = 0;
let delayStartTime = 0;
let delayEndTime = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
throw new LLMConnectionError('openai');
}
if (callCount === 2) {
delayStartTime = Date.now();
throw new LLMConnectionError('openai');
}
delayEndTime = Date.now();
return 'success';
};
await retryWithBackoff(fn, {
maxAttempts: 4,
baseDelayMs: 20,
maxDelayMs: 1000,
jitter: false,
});
const actualDelay = delayEndTime - delayStartTime;
expect(actualDelay).toBeGreaterThanOrEqual(30);
expect(actualDelay).toBeLessThan(150);
});
test('custom options: maxAttempts respected', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
throw new LLMConnectionError('openai');
};
try {
await retryWithBackoff(fn, { maxAttempts: 2 });
expect(true).toBe(false);
} catch (e: any) {
expect(callCount).toBe(2);
}
});
test('custom options: baseDelayMs respected', async () => {
let callCount = 0;
let delayStartTime = 0;
let delayEndTime = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
delayStartTime = Date.now();
throw new LLMConnectionError('openai');
}
delayEndTime = Date.now();
return 'success';
};
await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 50,
maxDelayMs: 1000,
jitter: false,
});
const actualDelay = delayEndTime - delayStartTime;
expect(actualDelay).toBeGreaterThanOrEqual(30);
expect(actualDelay).toBeLessThan(200);
});
test('jitter disabled: delay is exactly exponential', async () => {
let callCount = 0;
const fn = async () => {
callCount++;
if (callCount < 3) {
throw new LLMConnectionError('openai');
}
return 'success';
};
const start = Date.now();
await retryWithBackoff(fn, {
maxAttempts: 4,
baseDelayMs: 10,
maxDelayMs: 100,
jitter: false,
});
const total = Date.now() - start;
expect(total).toBeLessThan(200);
});
test('jitter enabled: adds randomness to delays', async () => {
let callCount = 0;
let delayStartTime = 0;
let delayEndTime = 0;
const fn = async () => {
callCount++;
if (callCount === 1) {
delayStartTime = Date.now();
throw new LLMConnectionError('openai');
}
delayEndTime = Date.now();
return 'success';
};
await retryWithBackoff(fn, {
maxAttempts: 3,
baseDelayMs: 10,
maxDelayMs: 100,
jitter: true,
});
const actualDelay = delayEndTime - delayStartTime;
expect(actualDelay).toBeGreaterThanOrEqual(5);
expect(actualDelay).toBeLessThan(100);
});
});
describe('withResilience', () => {
test('acquires semaphore, calls fn, releases semaphore on success', async () => {
const sem = new LLMSemaphore(1);
let fnCalled = false;
await withResilience(sem, async () => {
fnCalled = true;
expect(sem.activeCount).toBe(1);
return 'result';
});
expect(fnCalled).toBe(true);
expect(sem.activeCount).toBe(0);
});
test('releases semaphore on failure', async () => {
const sem = new LLMSemaphore(1);
try {
await withResilience(sem, async () => {
expect(sem.activeCount).toBe(1);
throw new LLMAuthError('openai');
});
expect(true).toBe(false);
} catch (e: any) {
expect(e.name).toBe('LLMAuthError');
expect(sem.activeCount).toBe(0);
}
});
test('combines semaphore + retry: respects semaphore then retries', async () => {
const sem = new LLMSemaphore(1);
let callCount = 0;
const result = await withResilience(
sem,
async () => {
callCount++;
expect(sem.activeCount).toBe(1);
if (callCount === 1) {
throw new LLMRateLimitError('openai', undefined);
}
return 'success';
},
{ maxAttempts: 2, baseDelayMs: 10, jitter: false }
);
expect(result).toBe('success');
expect(callCount).toBe(2);
expect(sem.activeCount).toBe(0);
});
test('releases semaphore even if retry fails', async () => {
const sem = new LLMSemaphore(1);
try {
await withResilience(
sem,
async () => {
throw new LLMConnectionError('openai');
},
{ maxAttempts: 2, baseDelayMs: 10, jitter: false }
);
expect(true).toBe(false);
} catch (e: any) {
expect(e.name).toBe('LLMConnectionError');
expect(sem.activeCount).toBe(0);
}
});
test('respects custom retry options passed to withResilience', async () => {
const sem = new LLMSemaphore(1);
let callCount = 0;
try {
await withResilience(
sem,
async () => {
callCount++;
throw new LLMConnectionError('openai');
},
{ maxAttempts: 3, baseDelayMs: 10, jitter: false }
);
} catch (e: any) {
expect(callCount).toBe(3);
}
});
test('concurrent requests respect semaphore limit during retries', async () => {
const sem = new LLMSemaphore(1);
const sequence: string[] = [];
let callCount1 = 0;
let callCount2 = 0;
const promise1 = withResilience(
sem,
async () => {
callCount1++;
sequence.push('call1');
if (callCount1 === 1) {
throw new LLMConnectionError('openai');
}
return 'result1';
},
{ maxAttempts: 2, baseDelayMs: 10, jitter: false }
);
const promise2 = withResilience(
sem,
async () => {
callCount2++;
sequence.push('call2');
return 'result2';
},
{ maxAttempts: 2, baseDelayMs: 10, jitter: false }
);
const [result1, result2] = await Promise.all([promise1, promise2]);
expect(result1).toBe('result1');
expect(result2).toBe('result2');
expect(callCount1).toBe(2);
expect(callCount2).toBe(1);
expect(sem.activeCount).toBe(0);
});
test('passes label through to retry logging', async () => {
const sem = new LLMSemaphore(1);
let callCount = 0;
const result = await withResilience(
sem,
async () => {
callCount++;
if (callCount === 1) {
throw new LLMConnectionError('openai');
}
return 'success';
},
{ maxAttempts: 2, baseDelayMs: 10, jitter: false },
'test-label'
);
expect(result).toBe('success');
});
test('returns function result correctly', async () => {
const sem = new LLMSemaphore(2);
const result = await withResilience(sem, async () => {
return { data: 'test', count: 42 };
});
expect(result.data).toBe('test');
expect(result.count).toBe(42);
});
});


@@ -6,6 +6,7 @@
* 2. Load (or cache) LLMProvider instances with decrypted API keys
* 3. Route chat() calls to the correct adapter
* 4. Invalidate cache when provider config changes via UI
* 5. Concurrency control + retry-with-backoff for resilience
*/
import { type ModelRole, modelRoleRepo } from '../db/repositories/model-role-repo';
@@ -17,6 +18,7 @@ import type { LLMProvider } from './providers/base';
import { createGeminiProvider } from './providers/gemini';
import { createOpenAICompatibleProvider } from './providers/openai-compatible';
import { createOpenAIResponsesProvider } from './providers/openai-responses';
import { LLMSemaphore, type RetryOptions, withResilience } from './resilience';
import type { LLMChatRequest, LLMChatResponse, ProviderType } from './types';
type ProviderFactoryFn = (config: {
@@ -35,6 +37,24 @@ const PROVIDER_FACTORIES: Record<ProviderType, ProviderFactoryFn> = {
export class LLMGateway {
private cache = new Map<string, LLMProvider>();
private semaphore: LLMSemaphore;
private retryOptions: Partial<RetryOptions>;
constructor(
maxConcurrent = 4,
retryOptions?: Partial<RetryOptions>
) {
this.semaphore = new LLMSemaphore(maxConcurrent);
this.retryOptions = retryOptions ?? {};
}
/**
* Reconfigure resilience settings (called when admin changes config via UI).
*/
updateResilienceConfig(maxConcurrent: number, retryOptions?: Partial<RetryOptions>): void {
this.semaphore = new LLMSemaphore(maxConcurrent);
this.retryOptions = retryOptions ?? this.retryOptions;
}
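// Illustrative sketch only (the trigger is outside this diff): a config-save handler
// could re-apply the new settings with something like
//   gateway.updateResilienceConfig(cfg.llmMaxConcurrentCalls, {
//     maxAttempts: cfg.llmRetryMaxAttempts,
//     baseDelayMs: cfg.llmRetryBaseDelayMs,
//   });
// where `gateway` and `cfg` are the shared gateway instance and the reloaded AppConfig.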
/**
* Call LLM by business role. The role determines which provider + model to use.
@@ -47,16 +67,30 @@ export class LLMGateway {
const assignment = modelRoleRepo.getByRole(role);
if (!assignment) throw new LLMNoProviderError(role);
const provider = this.getOrCreateProvider(assignment.provider_id);
return provider.chat({ ...request, model: assignment.model });
return withResilience(
this.semaphore,
() => {
const provider = this.getOrCreateProvider(assignment.provider_id);
return provider.chat({ ...request, model: assignment.model });
},
this.retryOptions,
role
);
}
/**
* Direct call to a specific provider (used for connection testing).
*/
async chatDirect(providerId: string, request: LLMChatRequest): Promise<LLMChatResponse> {
const provider = this.getOrCreateProvider(providerId);
return provider.chat(request);
return withResilience(
this.semaphore,
() => {
const provider = this.getOrCreateProvider(providerId);
return provider.chat(request);
},
this.retryOptions,
`direct:${providerId}`
);
}
/**
@@ -66,11 +100,18 @@ export class LLMGateway {
const assignment = modelRoleRepo.getByRole('embedding');
if (!assignment) throw new LLMNoProviderError('embedding');
const provider = this.getOrCreateProvider(assignment.provider_id);
if (!provider.embed) {
throw new LLMError(`Provider '${provider.type}' does not support embeddings`, provider.type);
}
return provider.embed(texts);
return withResilience(
this.semaphore,
() => {
const provider = this.getOrCreateProvider(assignment.provider_id);
if (!provider.embed) {
throw new LLMError(`Provider '${provider.type}' does not support embeddings`, provider.type);
}
return provider.embed(texts);
},
this.retryOptions,
'embedding'
);
}
/**

src/llm/resilience.ts (new file, +173 lines)

@@ -0,0 +1,173 @@
/**
* LLM Resilience Layer — Semaphore + Retry-with-Backoff.
*
* Provides concurrency control and automatic retry for LLM API calls.
* Designed to be integrated into the LLMGateway as a transparent wrapper.
*/
import { logger } from '../utils/logger';
import { LLMConnectionError, LLMRateLimitError } from './errors';
// ---------------------------------------------------------------------------
// Semaphore — limits concurrent in-flight LLM requests
// ---------------------------------------------------------------------------
export class LLMSemaphore {
private current = 0;
private readonly queue: Array<() => void> = [];
constructor(private readonly maxConcurrent: number) {
if (maxConcurrent < 1) {
throw new Error(`maxConcurrent must be >= 1, got ${maxConcurrent}`);
}
}
async acquire(): Promise<void> {
if (this.current < this.maxConcurrent) {
this.current++;
return;
}
return new Promise<void>((resolve) => {
this.queue.push(() => {
this.current++;
resolve();
});
});
}
release(): void {
this.current--;
const next = this.queue.shift();
if (next) {
next();
}
}
/** Current number of in-flight requests. */
get activeCount(): number {
return this.current;
}
/** Number of requests waiting for a slot. */
get pendingCount(): number {
return this.queue.length;
}
}
// ---------------------------------------------------------------------------
// Retry with Exponential Backoff
// ---------------------------------------------------------------------------
export interface RetryOptions {
/** Maximum number of attempts (including the first). Default: 3 */
maxAttempts: number;
/** Base delay in milliseconds. Default: 1000 */
baseDelayMs: number;
/** Maximum delay cap in milliseconds. Default: 60000 */
maxDelayMs: number;
/** Whether to add jitter to delays. Default: true */
jitter: boolean;
}
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
maxAttempts: 3,
baseDelayMs: 1000,
maxDelayMs: 60000,
jitter: true,
};
/**
* Determines whether an error is retryable.
* Only rate limit (429) and connection errors (5xx, network) are retried.
*/
function isRetryableError(error: unknown): boolean {
return error instanceof LLMRateLimitError || error instanceof LLMConnectionError;
}
/**
* Calculates the delay for a given attempt.
* Respects `retryAfterSeconds` from rate limit responses when available.
*/
function calculateDelay(attempt: number, error: unknown, options: RetryOptions): number {
// If the provider told us when to retry, respect that
if (error instanceof LLMRateLimitError && error.retryAfterSeconds) {
return error.retryAfterSeconds * 1000;
}
// Exponential backoff: baseDelay * 2^(attempt-1), capped at maxDelay
const exponential = Math.min(options.baseDelayMs * 2 ** (attempt - 1), options.maxDelayMs);
// Add jitter to prevent thundering herd
const jitter = options.jitter ? Math.random() * options.baseDelayMs : 0;
return exponential + jitter;
}
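// Worked example with the defaults above and jitter disabled: a failure on attempt 1
// waits 1000 ms before attempt 2, a failure on attempt 2 waits 2000 ms before attempt 3,
// and the 60000 ms cap only kicks in for much later attempts. A 429 carrying
// retryAfterSeconds = 30 overrides the formula and waits 30000 ms instead.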
function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
* Retries an async function with exponential backoff.
* Only retries on LLMRateLimitError and LLMConnectionError.
* All other errors are thrown immediately.
*/
export async function retryWithBackoff<T>(
fn: () => Promise<T>,
options?: Partial<RetryOptions>,
label?: string
): Promise<T> {
const opts = { ...DEFAULT_RETRY_OPTIONS, ...options };
for (let attempt = 1; attempt <= opts.maxAttempts; attempt++) {
try {
return await fn();
} catch (error) {
const isLast = attempt === opts.maxAttempts;
const isRetryable = isRetryableError(error);
if (isLast || !isRetryable) {
throw error;
}
const delayMs = calculateDelay(attempt, error, opts);
const errorName = error instanceof Error ? error.constructor.name : 'UnknownError';
logger.warn(
`LLM call${label ? ` [${label}]` : ''} failed (${errorName}), retrying in ${Math.round(delayMs)}ms`,
{
attempt,
maxAttempts: opts.maxAttempts,
delayMs: Math.round(delayMs),
error: error instanceof Error ? error.message : String(error),
}
);
await sleep(delayMs);
}
}
// TypeScript exhaustiveness — should never reach here
throw new Error('retryWithBackoff: unreachable');
}
// ---------------------------------------------------------------------------
// Combined: withResilience wraps a call with semaphore + retry
// ---------------------------------------------------------------------------
/**
* Wraps an async function with semaphore-based concurrency control and retry logic.
*/
export async function withResilience<T>(
semaphore: LLMSemaphore,
fn: () => Promise<T>,
retryOptions?: Partial<RetryOptions>,
label?: string
): Promise<T> {
await semaphore.acquire();
try {
return await retryWithBackoff(fn, retryOptions, label);
} finally {
semaphore.release();
}
}
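
As a usage illustration (a minimal sketch, not part of this commit): share one LLMSemaphore across callers and wrap each provider call with withResilience. The flaky call below is a stand-in for a real provider request, and the relative import paths assume the snippet lives next to resilience.ts.

import { LLMRateLimitError } from './errors';
import { LLMSemaphore, withResilience } from './resilience';

const semaphore = new LLMSemaphore(4);
let calls = 0;

// Stand-in for a provider call: fails once with a 429, then succeeds.
async function flakyProviderCall(): Promise<string> {
  calls++;
  if (calls === 1) throw new LLMRateLimitError('openai', 1);
  return 'ok';
}

const result = await withResilience(
  semaphore,
  flakyProviderCall,
  { maxAttempts: 3, baseDelayMs: 500, jitter: false },
  'example'
);
// result === 'ok'; the first failure was retried after ~1 s because retryAfterSeconds is honored.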