diff --git a/src/config/config-manager.ts b/src/config/config-manager.ts
index 2f7d0d8..82ee16e 100644
--- a/src/config/config-manager.ts
+++ b/src/config/config-manager.ts
@@ -37,6 +37,10 @@ export interface AppConfig {
   enableHumanGate: boolean;
   allowedCommands: string[];
   commandTimeoutMs: number;
+  llmMaxConcurrentCalls: number;
+  llmRetryMaxAttempts: number;
+  llmRetryBaseDelayMs: number;
+  enableTriage: boolean;
   qdrantUrl: string | undefined;
   enableMemory: boolean;
   fewShotExamplesCount: number;
@@ -153,6 +157,10 @@ class ConfigManager {
         'wc',
       ]),
       commandTimeoutMs: toNumber('REVIEW_COMMAND_TIMEOUT_MS', 10000),
+      llmMaxConcurrentCalls: toNumber('LLM_MAX_CONCURRENT_CALLS', 4),
+      llmRetryMaxAttempts: toNumber('LLM_RETRY_MAX_ATTEMPTS', 3),
+      llmRetryBaseDelayMs: toNumber('LLM_RETRY_BASE_DELAY_MS', 1000),
+      enableTriage: toBoolean('ENABLE_TRIAGE', true),
       qdrantUrl: values.QDRANT_URL,
       enableMemory: toBoolean('ENABLE_MEMORY', false),
       fewShotExamplesCount: toNumber('FEW_SHOT_EXAMPLES_COUNT', 10),
diff --git a/src/config/config-schema.ts b/src/config/config-schema.ts
index 1f85012..735ee1a 100644
--- a/src/config/config-schema.ts
+++ b/src/config/config-schema.ts
@@ -266,6 +266,48 @@ export const CONFIG_FIELDS: ConfigFieldMeta[] = [
     max: 300000,
     defaultValue: 10000,
   },
+  {
+    envKey: 'LLM_MAX_CONCURRENT_CALLS',
+    group: 'review',
+    label: 'LLM 最大并发调用',
+    description: '同时在飞的 LLM API 调用上限(防止触发 Provider 并发/RPM 限制)',
+    type: 'number',
+    sensitive: false,
+    min: 1,
+    max: 20,
+    defaultValue: 4,
+  },
+  {
+    envKey: 'LLM_RETRY_MAX_ATTEMPTS',
+    group: 'review',
+    label: 'LLM 重试次数',
+    description: 'LLM 调用失败后的最大重试次数(仅对 429/网络错误生效)',
+    type: 'number',
+    sensitive: false,
+    min: 1,
+    max: 10,
+    defaultValue: 3,
+  },
+  {
+    envKey: 'LLM_RETRY_BASE_DELAY_MS',
+    group: 'review',
+    label: 'LLM 重试基础延迟(ms)',
+    description: '重试时的基础延迟(指数退避,实际延迟 = baseDelay × 2^(attempt-1))',
+    type: 'number',
+    sensitive: false,
+    min: 100,
+    max: 30000,
+    defaultValue: 1000,
+  },
+  {
+    envKey: 'ENABLE_TRIAGE',
+    group: 'review',
+    label: '启用变更分流',
+    description: '是否启用 Triage 分流(用 Planner 模型先评估变更复杂度,再按需派发 Specialist)',
+    type: 'boolean',
+    sensitive: false,
+    defaultValue: true,
+  },
 
   // ── 记忆与学习 ──────────────────────────────────────────────────────────
   {
diff --git a/src/llm/__tests__/resilience.test.ts b/src/llm/__tests__/resilience.test.ts
new file mode 100644
index 0000000..9680ee0
--- /dev/null
+++ b/src/llm/__tests__/resilience.test.ts
@@ -0,0 +1,586 @@
+import { describe, expect, test } from 'bun:test';
+import { LLMAuthError, LLMConnectionError, LLMRateLimitError } from '../errors';
+import { LLMSemaphore, retryWithBackoff, withResilience } from '../resilience';
+
+describe('LLMSemaphore', () => {
+  test('constructor throws for maxConcurrent < 1', () => {
+    try {
+      new LLMSemaphore(0);
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.message).toContain('maxConcurrent must be >= 1');
+    }
+  });
+
+  test('constructor throws for negative maxConcurrent', () => {
+    try {
+      new LLMSemaphore(-5);
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.message).toContain('maxConcurrent must be >= 1');
+    }
+  });
+
+  test('constructor accepts valid maxConcurrent', () => {
+    const sem = new LLMSemaphore(5);
+    expect(sem.activeCount).toBe(0);
+    expect(sem.pendingCount).toBe(0);
+  });
+
+  test('acquire() resolves immediately when under limit', async () => {
+    const sem = new LLMSemaphore(2);
+    await sem.acquire();
+    expect(sem.activeCount).toBe(1);
+    expect(sem.pendingCount).toBe(0);
+  });
+
+  test('acquire() resolves multiple times when under limit', async () => {
+    const sem = new LLMSemaphore(3);
+    await sem.acquire();
+    await sem.acquire();
+    expect(sem.activeCount).toBe(2);
+    expect(sem.pendingCount).toBe(0);
+  });
+
+  test('acquire() blocks when at limit', async () => {
+    const sem = new LLMSemaphore(1);
+    await sem.acquire();
+    expect(sem.activeCount).toBe(1);
+
+    let secondAcquireResolved = false;
+    sem.acquire().then(() => {
+      secondAcquireResolved = true;
+    });
+
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    expect(secondAcquireResolved).toBe(false);
+    expect(sem.pendingCount).toBe(1);
+  });
+
+  test('release() allows blocked acquire() to resume', async () => {
+    const sem = new LLMSemaphore(1);
+    await sem.acquire();
+
+    let secondAcquireResolved = false;
+    const secondPromise = sem.acquire().then(() => {
+      secondAcquireResolved = true;
+    });
+
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    expect(secondAcquireResolved).toBe(false);
+
+    sem.release();
+
+    await secondPromise;
+    expect(secondAcquireResolved).toBe(true);
+    expect(sem.activeCount).toBe(1);
+  });
+
+  test('activeCount reflects correct state after acquire/release cycle', async () => {
+    const sem = new LLMSemaphore(2);
+    expect(sem.activeCount).toBe(0);
+
+    await sem.acquire();
+    expect(sem.activeCount).toBe(1);
+
+    await sem.acquire();
+    expect(sem.activeCount).toBe(2);
+
+    sem.release();
+    expect(sem.activeCount).toBe(1);
+
+    sem.release();
+    expect(sem.activeCount).toBe(0);
+  });
+
+  test('pendingCount reflects queued requests', async () => {
+    const sem = new LLMSemaphore(1);
+    await sem.acquire();
+    expect(sem.pendingCount).toBe(0);
+
+    sem.acquire();
+    sem.acquire();
+    sem.acquire();
+
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    expect(sem.pendingCount).toBe(3);
+  });
+
+  test('multiple concurrent acquires with interleaved releases', async () => {
+    const sem = new LLMSemaphore(2);
+    const sequence: string[] = [];
+
+    await sem.acquire();
+    sequence.push('acquire1');
+    await sem.acquire();
+    sequence.push('acquire2');
+
+    expect(sem.activeCount).toBe(2);
+    expect(sem.pendingCount).toBe(0);
+
+    const p3 = sem.acquire().then(() => sequence.push('acquire3'));
+    const p4 = sem.acquire().then(() => sequence.push('acquire4'));
+    const p5 = sem.acquire().then(() => sequence.push('acquire5'));
+
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    expect(sem.pendingCount).toBe(3);
+
+    sem.release();
+    expect(sem.activeCount).toBe(2);
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    expect(sequence).toContain('acquire3');
+
+    sem.release();
+    expect(sem.activeCount).toBe(2);
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    expect(sequence).toContain('acquire4');
+
+    sem.release();
+    await p5;
+    expect(sequence).toContain('acquire5');
+  });
+});
+
+describe('retryWithBackoff', () => {
+  test('succeeds on first attempt (no retry)', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      return 'success';
+    };
+
+    const result = await retryWithBackoff(fn, { maxAttempts: 3 });
+    expect(result).toBe('success');
+    expect(callCount).toBe(1);
+  });
+
+  test('retries on LLMRateLimitError, succeeds on 2nd attempt', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        throw new LLMRateLimitError('openai', 1);
+      }
+      return 'success';
+    };
+
+    const result = await retryWithBackoff(fn, {
+      maxAttempts: 3,
+      baseDelayMs: 10,
+      maxDelayMs: 100,
+      jitter: false,
+    });
+    expect(result).toBe('success');
+    expect(callCount).toBe(2);
+  });
+
+  test('retries on LLMConnectionError, succeeds on 2nd attempt', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        throw new LLMConnectionError('openai', 'network timeout');
+      }
+      return 'success';
+    };
+
+    const result = await retryWithBackoff(fn, {
+      maxAttempts: 3,
+      baseDelayMs: 10,
+      maxDelayMs: 100,
+      jitter: false,
+    });
+    expect(result).toBe('success');
+    expect(callCount).toBe(2);
+  });
+
+  test('throws immediately on LLMAuthError (non-retryable)', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      throw new LLMAuthError('openai', 'invalid api key');
+    };
+
+    try {
+      await retryWithBackoff(fn, { maxAttempts: 3 });
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.name).toBe('LLMAuthError');
+      expect(callCount).toBe(1);
+    }
+  });
+
+  test('throws immediately on generic Error (non-retryable)', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      throw new Error('some other error');
+    };
+
+    try {
+      await retryWithBackoff(fn, { maxAttempts: 3 });
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.message).toBe('some other error');
+      expect(callCount).toBe(1);
+    }
+  });
+
+  test('exhausts maxAttempts and throws last error', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      throw new LLMRateLimitError('openai', undefined, 'rate limited');
+    };
+
+    try {
+      await retryWithBackoff(fn, {
+        maxAttempts: 3,
+        baseDelayMs: 10,
+        maxDelayMs: 100,
+        jitter: false,
+      });
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.name).toBe('LLMRateLimitError');
+      expect(callCount).toBe(3);
+    }
+  });
+
+  test('respects retryAfterSeconds from LLMRateLimitError', async () => {
+    let callCount = 0;
+    let delayStartTime = 0;
+    let delayEndTime = 0;
+
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        delayStartTime = Date.now();
+        throw new LLMRateLimitError('openai', 0.1);
+      }
+      delayEndTime = Date.now();
+      return 'success';
+    };
+
+    const result = await retryWithBackoff(fn, {
+      maxAttempts: 3,
+      baseDelayMs: 10,
+      maxDelayMs: 100,
+      jitter: false,
+    });
+
+    expect(result).toBe('success');
+    expect(callCount).toBe(2);
+
+    const actualDelay = delayEndTime - delayStartTime;
+    expect(actualDelay).toBeGreaterThanOrEqual(50);
+    expect(actualDelay).toBeLessThan(300);
+  });
+
+  test('exponential backoff without jitter: attempt 1 delay ≈ baseDelayMs', async () => {
+    let callCount = 0;
+    let delayStartTime = 0;
+    let delayEndTime = 0;
+
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        delayStartTime = Date.now();
+        throw new LLMConnectionError('openai');
+      }
+      delayEndTime = Date.now();
+      return 'success';
+    };
+
+    await retryWithBackoff(fn, {
+      maxAttempts: 3,
+      baseDelayMs: 20,
+      maxDelayMs: 1000,
+      jitter: false,
+    });
+
+    const actualDelay = delayEndTime - delayStartTime;
+    expect(actualDelay).toBeGreaterThanOrEqual(10);
+    expect(actualDelay).toBeLessThan(100);
+  });
+
+  test('exponential backoff attempt 2: baseDelayMs * 2', async () => {
+    let callCount = 0;
+    let delayStartTime = 0;
+    let delayEndTime = 0;
+
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        throw new LLMConnectionError('openai');
+      }
+      if (callCount === 2) {
+        delayStartTime = Date.now();
+        throw new LLMConnectionError('openai');
+      }
+      delayEndTime = Date.now();
+      return 'success';
+    };
+
+    await retryWithBackoff(fn, {
+      maxAttempts: 4,
+      baseDelayMs: 20,
+      maxDelayMs: 1000,
+      jitter: false,
+    });
+
+    const actualDelay = delayEndTime - delayStartTime;
+    expect(actualDelay).toBeGreaterThanOrEqual(30);
+    expect(actualDelay).toBeLessThan(150);
+  });
+
+  test('custom options: maxAttempts respected', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      throw new LLMConnectionError('openai');
+    };
+
+    try {
+      await retryWithBackoff(fn, { maxAttempts: 2 });
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(callCount).toBe(2);
+    }
+  });
+
+  test('custom options: baseDelayMs respected', async () => {
+    let callCount = 0;
+    let delayStartTime = 0;
+    let delayEndTime = 0;
+
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        delayStartTime = Date.now();
+        throw new LLMConnectionError('openai');
+      }
+      delayEndTime = Date.now();
+      return 'success';
+    };
+
+    await retryWithBackoff(fn, {
+      maxAttempts: 3,
+      baseDelayMs: 50,
+      maxDelayMs: 1000,
+      jitter: false,
+    });
+
+    const actualDelay = delayEndTime - delayStartTime;
+    expect(actualDelay).toBeGreaterThanOrEqual(30);
+    expect(actualDelay).toBeLessThan(200);
+  });
+
+  test('jitter disabled: delay is exactly exponential', async () => {
+    let callCount = 0;
+    const fn = async () => {
+      callCount++;
+      if (callCount < 3) {
+        throw new LLMConnectionError('openai');
+      }
+      return 'success';
+    };
+
+    const start = Date.now();
+    await retryWithBackoff(fn, {
+      maxAttempts: 4,
+      baseDelayMs: 10,
+      maxDelayMs: 100,
+      jitter: false,
+    });
+    const total = Date.now() - start;
+
+    expect(total).toBeLessThan(200);
+  });
+
+  test('jitter enabled: adds randomness to delays', async () => {
+    let callCount = 0;
+    let delayStartTime = 0;
+    let delayEndTime = 0;
+
+    const fn = async () => {
+      callCount++;
+      if (callCount === 1) {
+        delayStartTime = Date.now();
+        throw new LLMConnectionError('openai');
+      }
+      delayEndTime = Date.now();
+      return 'success';
+    };
+
+    await retryWithBackoff(fn, {
+      maxAttempts: 3,
+      baseDelayMs: 10,
+      maxDelayMs: 100,
+      jitter: true,
+    });
+
+    const actualDelay = delayEndTime - delayStartTime;
+    expect(actualDelay).toBeGreaterThanOrEqual(5);
+    expect(actualDelay).toBeLessThan(100);
+  });
+});
+
+describe('withResilience', () => {
+  test('acquires semaphore, calls fn, releases semaphore on success', async () => {
+    const sem = new LLMSemaphore(1);
+    let fnCalled = false;
+
+    await withResilience(sem, async () => {
+      fnCalled = true;
+      expect(sem.activeCount).toBe(1);
+      return 'result';
+    });
+
+    expect(fnCalled).toBe(true);
+    expect(sem.activeCount).toBe(0);
+  });
+
+  test('releases semaphore on failure', async () => {
+    const sem = new LLMSemaphore(1);
+
+    try {
+      await withResilience(sem, async () => {
+        expect(sem.activeCount).toBe(1);
+        throw new LLMAuthError('openai');
+      });
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.name).toBe('LLMAuthError');
+      expect(sem.activeCount).toBe(0);
+    }
+  });
+
+  test('combines semaphore + retry: respects semaphore then retries', async () => {
+    const sem = new LLMSemaphore(1);
+    let callCount = 0;
+
+    const result = await withResilience(
+      sem,
+      async () => {
+        callCount++;
+        expect(sem.activeCount).toBe(1);
+        if (callCount === 1) {
+          throw new LLMRateLimitError('openai', undefined);
+        }
+        return 'success';
+      },
+      { maxAttempts: 2, baseDelayMs: 10, jitter: false }
+    );
+
+    expect(result).toBe('success');
+    expect(callCount).toBe(2);
+    expect(sem.activeCount).toBe(0);
+  });
+
+  test('releases semaphore even if retry fails', async () => {
+    const sem = new LLMSemaphore(1);
+
+    try {
+      await withResilience(
+        sem,
+        async () => {
+          throw new LLMConnectionError('openai');
+        },
+        { maxAttempts: 2, baseDelayMs: 10, jitter: false }
+      );
+      expect(true).toBe(false);
+    } catch (e: any) {
+      expect(e.name).toBe('LLMConnectionError');
+      expect(sem.activeCount).toBe(0);
+    }
+  });
+
+  test('respects custom retry options passed to withResilience', async () => {
+    const sem = new LLMSemaphore(1);
+    let callCount = 0;
+
+    try {
+      await withResilience(
+        sem,
+        async () => {
+          callCount++;
+          throw new LLMConnectionError('openai');
+        },
+        { maxAttempts: 3, baseDelayMs: 10, jitter: false }
+      );
+    } catch (e: any) {
+      expect(callCount).toBe(3);
+    }
+  });
+
+  test('concurrent requests respect semaphore limit during retries', async () => {
+    const sem = new LLMSemaphore(1);
+    const sequence: string[] = [];
+    let callCount1 = 0;
+    let callCount2 = 0;
+
+    const promise1 = withResilience(
+      sem,
+      async () => {
+        callCount1++;
+        sequence.push('call1');
+        if (callCount1 === 1) {
+          throw new LLMConnectionError('openai');
+        }
+        return 'result1';
+      },
+      { maxAttempts: 2, baseDelayMs: 10, jitter: false }
+    );
+
+    const promise2 = withResilience(
+      sem,
+      async () => {
+        callCount2++;
+        sequence.push('call2');
+        return 'result2';
+      },
+      { maxAttempts: 2, baseDelayMs: 10, jitter: false }
+    );
+
+    const [result1, result2] = await Promise.all([promise1, promise2]);
+    expect(result1).toBe('result1');
+    expect(result2).toBe('result2');
+
+    expect(callCount1).toBe(2);
+    expect(callCount2).toBe(1);
+
+    expect(sem.activeCount).toBe(0);
+  });
+
+  test('passes label through to retry logging', async () => {
+    const sem = new LLMSemaphore(1);
+    let callCount = 0;
+
+    const result = await withResilience(
+      sem,
+      async () => {
+        callCount++;
+        if (callCount === 1) {
+          throw new LLMConnectionError('openai');
+        }
+        return 'success';
+      },
+      { maxAttempts: 2, baseDelayMs: 10, jitter: false },
+      'test-label'
+    );
+
+    expect(result).toBe('success');
+  });
+
+  test('returns function result correctly', async () => {
+    const sem = new LLMSemaphore(2);
+
+    const result = await withResilience(sem, async () => {
+      return { data: 'test', count: 42 };
+    });
+
+    expect(result.data).toBe('test');
+    expect(result.count).toBe(42);
+  });
+});
diff --git a/src/llm/gateway.ts b/src/llm/gateway.ts
index 2f40cde..58d615f 100644
--- a/src/llm/gateway.ts
+++ b/src/llm/gateway.ts
@@ -6,6 +6,7 @@
  * 2. Load (or cache) LLMProvider instances with decrypted API keys
  * 3. Route chat() calls to the correct adapter
  * 4. Invalidate cache when provider config changes via UI
+ * 5. Concurrency control + retry-with-backoff for resilience
  */
 
 import { type ModelRole, modelRoleRepo } from '../db/repositories/model-role-repo';
@@ -17,6 +18,7 @@ import type { LLMProvider } from './providers/base';
 import { createGeminiProvider } from './providers/gemini';
 import { createOpenAICompatibleProvider } from './providers/openai-compatible';
 import { createOpenAIResponsesProvider } from './providers/openai-responses';
+import { LLMSemaphore, type RetryOptions, withResilience } from './resilience';
 import type { LLMChatRequest, LLMChatResponse, ProviderType } from './types';
 
 type ProviderFactoryFn = (config: {
@@ -35,6 +37,24 @@ const PROVIDER_FACTORIES: Record<ProviderType, ProviderFactoryFn> = {
 
 export class LLMGateway {
   private cache = new Map<string, LLMProvider>();
+  private semaphore: LLMSemaphore;
+  private retryOptions: Partial<RetryOptions>;
+
+  constructor(
+    maxConcurrent = 4,
+    retryOptions?: Partial<RetryOptions>
+  ) {
+    this.semaphore = new LLMSemaphore(maxConcurrent);
+    this.retryOptions = retryOptions ?? {};
+  }
+
+  /**
+   * Reconfigure resilience settings (called when admin changes config via UI).
+   */
+  updateResilienceConfig(maxConcurrent: number, retryOptions?: Partial<RetryOptions>): void {
+    this.semaphore = new LLMSemaphore(maxConcurrent);
+    this.retryOptions = retryOptions ?? this.retryOptions;
+  }
 
   /**
    * Call LLM by business role. The role determines which provider + model to use.
@@ -47,16 +67,30 @@ export class LLMGateway {
     const assignment = modelRoleRepo.getByRole(role);
     if (!assignment) throw new LLMNoProviderError(role);
 
-    const provider = this.getOrCreateProvider(assignment.provider_id);
-    return provider.chat({ ...request, model: assignment.model });
+    return withResilience(
+      this.semaphore,
+      () => {
+        const provider = this.getOrCreateProvider(assignment.provider_id);
+        return provider.chat({ ...request, model: assignment.model });
+      },
+      this.retryOptions,
+      role
+    );
   }
 
   /**
    * Direct call to a specific provider (used for connection testing).
    */
   async chatDirect(providerId: string, request: LLMChatRequest): Promise<LLMChatResponse> {
-    const provider = this.getOrCreateProvider(providerId);
-    return provider.chat(request);
+    return withResilience(
+      this.semaphore,
+      () => {
+        const provider = this.getOrCreateProvider(providerId);
+        return provider.chat(request);
+      },
+      this.retryOptions,
+      `direct:${providerId}`
+    );
   }
 
   /**
@@ -66,11 +100,18 @@ export class LLMGateway {
     const assignment = modelRoleRepo.getByRole('embedding');
     if (!assignment) throw new LLMNoProviderError('embedding');
 
-    const provider = this.getOrCreateProvider(assignment.provider_id);
-    if (!provider.embed) {
-      throw new LLMError(`Provider '${provider.type}' does not support embeddings`, provider.type);
-    }
-    return provider.embed(texts);
+    return withResilience(
+      this.semaphore,
+      () => {
+        const provider = this.getOrCreateProvider(assignment.provider_id);
+        if (!provider.embed) {
+          throw new LLMError(`Provider '${provider.type}' does not support embeddings`, provider.type);
+        }
+        return provider.embed(texts);
+      },
+      this.retryOptions,
+      'embedding'
+    );
   }
 
   /**
diff --git a/src/llm/resilience.ts b/src/llm/resilience.ts
new file mode 100644
index 0000000..76e8593
--- /dev/null
+++ b/src/llm/resilience.ts
@@ -0,0 +1,173 @@
+/**
+ * LLM Resilience Layer — Semaphore + Retry-with-Backoff.
+ *
+ * Provides concurrency control and automatic retry for LLM API calls.
+ * Designed to be integrated into the LLMGateway as a transparent wrapper.
+ */
+
+import { logger } from '../utils/logger';
+import { LLMConnectionError, LLMRateLimitError } from './errors';
+
+// ---------------------------------------------------------------------------
+// Semaphore — limits concurrent in-flight LLM requests
+// ---------------------------------------------------------------------------
+
+export class LLMSemaphore {
+  private current = 0;
+  private readonly queue: Array<() => void> = [];
+
+  constructor(private readonly maxConcurrent: number) {
+    if (maxConcurrent < 1) {
+      throw new Error(`maxConcurrent must be >= 1, got ${maxConcurrent}`);
+    }
+  }
+
+  async acquire(): Promise<void> {
+    if (this.current < this.maxConcurrent) {
+      this.current++;
+      return;
+    }
+    return new Promise((resolve) => {
+      this.queue.push(() => {
+        this.current++;
+        resolve();
+      });
+    });
+  }
+
+  release(): void {
+    this.current--;
+    const next = this.queue.shift();
+    if (next) {
+      next();
+    }
+  }
+
+  /** Current number of in-flight requests. */
+  get activeCount(): number {
+    return this.current;
+  }
+
+  /** Number of requests waiting for a slot. */
+  get pendingCount(): number {
+    return this.queue.length;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Retry with Exponential Backoff
+// ---------------------------------------------------------------------------
+
+export interface RetryOptions {
+  /** Maximum number of attempts (including the first). Default: 3 */
+  maxAttempts: number;
+  /** Base delay in milliseconds. Default: 1000 */
+  baseDelayMs: number;
+  /** Maximum delay cap in milliseconds. Default: 60000 */
+  maxDelayMs: number;
+  /** Whether to add jitter to delays. Default: true */
+  jitter: boolean;
+}
+
+const DEFAULT_RETRY_OPTIONS: RetryOptions = {
+  maxAttempts: 3,
+  baseDelayMs: 1000,
+  maxDelayMs: 60000,
+  jitter: true,
+};
+
+/**
+ * Determines whether an error is retryable.
+ * Only rate limit (429) and connection errors (5xx, network) are retried.
+ */
+function isRetryableError(error: unknown): boolean {
+  return error instanceof LLMRateLimitError || error instanceof LLMConnectionError;
+}
+
+/**
+ * Calculates the delay for a given attempt.
+ * Respects `retryAfterSeconds` from rate limit responses when available.
+ */
+function calculateDelay(attempt: number, error: unknown, options: RetryOptions): number {
+  // If the provider told us when to retry, respect that
+  if (error instanceof LLMRateLimitError && error.retryAfterSeconds) {
+    return error.retryAfterSeconds * 1000;
+  }
+
+  // Exponential backoff: baseDelay * 2^(attempt-1), capped at maxDelay
+  const exponential = Math.min(options.baseDelayMs * 2 ** (attempt - 1), options.maxDelayMs);
+
+  // Add jitter to prevent thundering herd
+  const jitter = options.jitter ? Math.random() * options.baseDelayMs : 0;
+
+  return exponential + jitter;
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Retries an async function with exponential backoff.
+ * Only retries on LLMRateLimitError and LLMConnectionError.
+ * All other errors are thrown immediately.
+ */
+export async function retryWithBackoff<T>(
+  fn: () => Promise<T>,
+  options?: Partial<RetryOptions>,
+  label?: string
+): Promise<T> {
+  const opts = { ...DEFAULT_RETRY_OPTIONS, ...options };
+
+  for (let attempt = 1; attempt <= opts.maxAttempts; attempt++) {
+    try {
+      return await fn();
+    } catch (error) {
+      const isLast = attempt === opts.maxAttempts;
+      const isRetryable = isRetryableError(error);
+
+      if (isLast || !isRetryable) {
+        throw error;
+      }
+
+      const delayMs = calculateDelay(attempt, error, opts);
+      const errorName = error instanceof Error ? error.constructor.name : 'UnknownError';
+
+      logger.warn(
+        `LLM call${label ? ` [${label}]` : ''} failed (${errorName}), retrying in ${Math.round(delayMs)}ms`,
+        {
+          attempt,
+          maxAttempts: opts.maxAttempts,
+          delayMs: Math.round(delayMs),
+          error: error instanceof Error ? error.message : String(error),
+        }
+      );
+
+      await sleep(delayMs);
+    }
+  }
+
+  // TypeScript exhaustiveness — should never reach here
+  throw new Error('retryWithBackoff: unreachable');
+}
+
+// ---------------------------------------------------------------------------
+// Combined: withResilience wraps a call with semaphore + retry
+// ---------------------------------------------------------------------------
+
+/**
+ * Wraps an async function with semaphore-based concurrency control and retry logic.
+ */
+export async function withResilience<T>(
+  semaphore: LLMSemaphore,
+  fn: () => Promise<T>,
+  retryOptions?: Partial<RetryOptions>,
+  label?: string
+): Promise<T> {
+  await semaphore.acquire();
+  try {
+    return await retryWithBackoff(fn, retryOptions, label);
+  } finally {
+    semaphore.release();
+  }
+}
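
Usage sketch (editorial note, not part of the patch): a minimal example of composing the new resilience pieces outside the gateway, assuming one shared semaphore per process. The `fetchCompletion` helper and the hard-coded numbers below are hypothetical stand-ins; in the real wiring they would come from the `AppConfig` fields added above (`llmMaxConcurrentCalls`, `llmRetryMaxAttempts`, `llmRetryBaseDelayMs`).

```typescript
import { LLMSemaphore, withResilience } from './resilience';

// One shared semaphore per process, sized from LLM_MAX_CONCURRENT_CALLS (hard-coded for the sketch).
const semaphore = new LLMSemaphore(4);

// Hypothetical helper: any async call can be wrapped the same way the gateway wraps chat().
async function fetchCompletion(prompt: string): Promise<string> {
  return withResilience(
    semaphore,
    async () => {
      // A real caller would invoke provider.chat() here; this stub just echoes the prompt.
      return `echo: ${prompt}`;
    },
    { maxAttempts: 3, baseDelayMs: 1000, jitter: true }, // mirrors the LLM_RETRY_* defaults
    'example' // label appears in the warnings logged by retryWithBackoff
  );
}
```

Calls beyond the concurrency limit queue on the semaphore; only LLMRateLimitError and LLMConnectionError are retried, and every other error surfaces on the first attempt.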