diff --git a/src/review/context/diff-extractor.ts b/src/review/context/diff-extractor.ts
new file mode 100644
index 0000000..293c5a0
--- /dev/null
+++ b/src/review/context/diff-extractor.ts
@@ -0,0 +1,376 @@
+import { readFile, lstat } from 'node:fs/promises';
+import path from 'node:path';
+import { DiffFile, ReviewContext, ReviewRun, ChangedFile } from '../types';
+import { SandboxExec } from './sandbox-exec';
+import { LocalRepoManager } from './local-repo-manager';
+
+/** Per-file line counts as reported by `--numstat`. */
+interface LineStats {
+  additions: number;
+  deletions: number;
+}
+
+/** Status letters accepted from `--name-status` output. */
+const KNOWN_STATUSES = ['A', 'M', 'D', 'R', 'C', 'T', 'U', 'X', 'B'];
+
+/**
+ * Maps a raw `--name-status` field (for example `M` or `R100`) to a status
+ * letter. Only the first character is inspected; anything unrecognised is
+ * reported as `M`.
+ */
+function toStatus(status: string): ChangedFile['status'] {
+  const value = status.trim().charAt(0).toUpperCase();
+  if (KNOWN_STATUSES.includes(value)) {
+    return value as ChangedFile['status'];
+  }
+  return 'M';
+}
+
+/**
+ * Resolves `relativePath` against `basePath` and rejects anything that lands
+ * outside of it.
+ *
+ * Containment is decided with `path.relative` rather than a string prefix
+ * test, so a sibling directory such as `/work/ws10` is not accepted as being
+ * inside `/work/ws1`.
+ *
+ * @throws Error when the resolved path is not inside `basePath`.
+ */
+function safePath(basePath: string, relativePath: string): string {
+  const root = path.resolve(basePath);
+  const resolved = path.resolve(root, relativePath);
+  const offset = path.relative(root, resolved);
+  const escapesRoot =
+    offset === '..' || offset.startsWith(`..${path.sep}`) || path.isAbsolute(offset);
+  if (escapesRoot) {
+    throw new Error(`非法文件路径: ${relativePath}`);
+  }
+  return resolved;
+}
+
+/**
+ * Returns the post-change path for a `--numstat` path field.
+ *
+ * Without `-z`, git prints renames and copies as `old => new`, or with the
+ * shared prefix/suffix factored out (`src/{old => new}/file.ts`), whereas
+ * `--name-status` prints the new path on its own.
+ */
+function numstatTargetPath(raw: string): string {
+  const braced = raw.match(/^(.*)\{(.*) => (.*)\}(.*)$/);
+  if (braced) {
+    const [, prefix = '', , renamedTo = '', suffix = ''] = braced;
+    // An empty side (`dir/{sub => }/f.ts`) leaves a doubled separator behind.
+    return `${prefix}${renamedTo}${suffix}`.replace(/\/{2,}/g, '/');
+  }
+  const arrowAt = raw.indexOf(' => ');
+  return arrowAt === -1 ? raw : raw.slice(arrowAt + ' => '.length);
+}
+
+/** Parses `--numstat` output into a map keyed by file path. */
+function parseNumstat(output: string): Map<string, LineStats> {
+  const statsByPath = new Map<string, LineStats>();
+  for (const line of output.split('\n')) {
+    if (!line.trim()) {
+      continue;
+    }
+    const [addRaw = '0', delRaw = '0', filename] = line.split('\t');
+    if (!filename) {
+      continue;
+    }
+    // Binary files report `-` for both counts; that parses to NaN and is stored as 0.
+    const additions = Number.parseInt(addRaw, 10);
+    const deletions = Number.parseInt(delRaw, 10);
+    const stats: LineStats = {
+      additions: Number.isFinite(additions) ? additions : 0,
+      deletions: Number.isFinite(deletions) ? deletions : 0,
+    };
+    statsByPath.set(filename, stats);
+    // Renames/copies are also indexed by their new path, which is the key the
+    // `--name-status` side looks up. An entry stored under a literal name wins.
+    const targetPath = numstatTargetPath(filename);
+    if (!statsByPath.has(targetPath)) {
+      statsByPath.set(targetPath, stats);
+    }
+  }
+  return statsByPath;
+}
+
+/**
+ * Joins `--name-status` output with parsed `--numstat` data.
+ *
+ * Rename and copy lines carry two paths; the last one (the new path) is
+ * used. Parsing stops as soon as `limit` files have been collected.
+ */
+function parseNameStatus(
+  output: string,
+  statsByPath: Map<string, LineStats>,
+  limit: number
+): ChangedFile[] {
+  const changedFiles: ChangedFile[] = [];
+  for (const line of output.split('\n')) {
+    if (!line.trim()) {
+      continue;
+    }
+    const [statusRaw = '', ...pathParts] = line.split('\t');
+    const filePath = pathParts[pathParts.length - 1];
+    if (!filePath) {
+      continue;
+    }
+    const stats = statsByPath.get(filePath) ?? { additions: 0, deletions: 0 };
+    changedFiles.push({
+      path: filePath,
+      status: toStatus(statusRaw),
+      additions: stats.additions,
+      deletions: stats.deletions,
+    });
+    if (changedFiles.length >= limit) {
+      break;
+    }
+  }
+  return changedFiles;
+}
+
+/**
+ * Builds the review context (diff, changed files, parsed hunks and file
+ * contents) for a run from a checked-out workspace.
+ */
+export class DiffExtractor {
+  /**
+   * @param sandboxExec Runner used for every git invocation.
+   * @param localRepoManager Resolves a commit's parent when the run has no base SHA.
+   * @param commandTimeoutMs Timeout applied to each git command.
+   * @param maxFilesPerRun Limit on the number of changed files collected per run.
+   * @param maxFileContentChars Limit on the characters kept from each file.
+   */
+  constructor(
+    private readonly sandboxExec: SandboxExec,
+    private readonly localRepoManager: LocalRepoManager,
+    private readonly commandTimeoutMs: number,
+    private readonly maxFilesPerRun: number,
+    private readonly maxFileContentChars: number
+  ) {}
+
+  /** Returns the sandbox runner this extractor was constructed with. */
+  getSandbox(): SandboxExec {
+    return this.sandboxExec;
+  }
+
+  /**
+   * Collects everything a review needs for `run`.
+   *
+   * @throws Error when the run carries neither `headSha` nor `commitSha`.
+   */
+  async buildContext(run: ReviewRun, mirrorPath: string, workspacePath: string): Promise<ReviewContext> {
+    const targetSha = run.headSha || run.commitSha;
+    if (!targetSha) {
+      throw new Error('缺少 target sha,无法构建审查上下文');
+    }
+
+    // Without an explicit base, fall back to the target's parent. When no
+    // parent can be resolved the target is handled as a root commit.
+    const baseSha =
+      run.baseSha || (await this.localRepoManager.resolveCommitParent(workspacePath, targetSha)) || undefined;
+
+    const diff = baseSha
+      ? await this.getDiff(workspacePath, run.eventType, baseSha, targetSha)
+      : await this.getRootCommitDiff(workspacePath, targetSha);
+
+    const changedFiles = baseSha
+      ? await this.getChangedFiles(workspacePath, baseSha, targetSha)
+      : await this.getRootCommitChangedFiles(workspacePath, targetSha);
+
+    // Restrict the parsed diff to the (possibly truncated) changed-file list
+    // so that it honours the same per-run file limit.
+    const allowedPaths = new Set(changedFiles.map((file) => file.path));
+    const parsedDiff = this.parseDiff(diff, allowedPaths);
+
+    const fileContents = await this.readChangedFileContents(workspacePath, changedFiles);
+
+    return {
+      workspacePath,
+      mirrorPath,
+      diff,
+      changedFiles,
+      parsedDiff,
+      fileContents,
+    };
+  }
+
+  /** Runs git inside the workspace and returns its stdout. */
+  private async git(workspacePath: string, args: string[]): Promise<string> {
+    const response = await this.sandboxExec.run('git', args, {
+      cwd: workspacePath,
+      timeoutMs: this.commandTimeoutMs,
+    });
+    return response.stdout;
+  }
+
+  /** Diff of a commit without a parent, taken from `git show`. */
+  private async getRootCommitDiff(workspacePath: string, sha: string): Promise<string> {
+    return this.git(workspacePath, ['show', '--format=', '--unified=3', sha]);
+  }
+
+  /**
+   * Returns the unified diff for the run.
+   *
+   * Pull requests use the three-dot range `base...target`; every other event
+   * shows the target commit on its own.
+   *
+   * NOTE(review): for non-PR events `baseSha` is ignored here while
+   * `getChangedFiles` still lists `base...target`, so the two may cover
+   * different ranges when the run's base is not the target's parent.
+   * Confirm which range is intended.
+   */
+  private async getDiff(
+    workspacePath: string,
+    eventType: ReviewRun['eventType'],
+    baseSha: string,
+    targetSha: string
+  ): Promise<string> {
+    if (eventType === 'pull_request') {
+      return this.git(workspacePath, ['diff', '--unified=3', `${baseSha}...${targetSha}`]);
+    }
+    return this.git(workspacePath, ['show', '--format=', '--unified=3', targetSha]);
+  }
+
+  /**
+   * Lists the files of a root commit.
+   *
+   * `--root` is required: without it `diff-tree` prints nothing for a commit
+   * that has no parent.
+   */
+  private async getRootCommitChangedFiles(workspacePath: string, sha: string): Promise<ChangedFile[]> {
+    const treeArgs = ['diff-tree', '--root', '--no-commit-id'];
+    const statusOutput = await this.git(workspacePath, [...treeArgs, '--name-status', '-r', sha]);
+    const numstatOutput = await this.git(workspacePath, [...treeArgs, '--numstat', '-r', sha]);
+    return parseNameStatus(statusOutput, parseNumstat(numstatOutput), this.maxFilesPerRun);
+  }
+
+  /** Lists the files changed in `base...target` together with their line counts. */
+  private async getChangedFiles(workspacePath: string, baseSha: string, targetSha: string): Promise<ChangedFile[]> {
+    const range = `${baseSha}...${targetSha}`;
+    const statusOutput = await this.git(workspacePath, ['diff', '--name-status', range]);
+    const numstatOutput = await this.git(workspacePath, ['diff', '--numstat', range]);
+    return parseNameStatus(statusOutput, parseNumstat(numstatOutput), this.maxFilesPerRun);
+  }
+
+  /**
+   * Reads the workspace copy of each changed file, keyed by its repository
+   * path and truncated to `maxFileContentChars`.
+   *
+   * Best effort: deleted files, symbolic links, paths that resolve outside
+   * the workspace and unreadable files are left out instead of failing.
+   */
+  private async readChangedFileContents(
+    workspacePath: string,
+    changedFiles: ChangedFile[]
+  ): Promise<Record<string, string>> {
+    const result: Record<string, string> = {};
+
+    for (const file of changedFiles) {
+      if (file.status === 'D') {
+        continue;
+      }
+      try {
+        const filePath = safePath(workspacePath, file.path);
+
+        // Refuse symbolic links so that a link committed to the repository
+        // cannot be used to read files from the host.
+        const stats = await lstat(filePath);
+        if (stats.isSymbolicLink()) {
+          continue;
+        }
+
+        const content = await readFile(filePath, 'utf-8');
+        result[file.path] = content.slice(0, this.maxFileContentChars);
+      } catch {
+        // Deliberately best effort: a file that cannot be read is omitted.
+        continue;
+      }
+    }
+
+    return result;
+  }
+
+  /**
+   * Parses a unified diff into per-file lists of added and context lines,
+   * each tagged with its line number in the new version of the file.
+   *
+   * Removed lines are not recorded and do not advance the line counter.
+   * Files without a `+++ b/` header (deletions, for example) and files
+   * without any recorded line are dropped from the result.
+   *
+   * @param diffContent Unified diff text as produced by `git diff` or `git show`.
+   * @param allowedPaths When given, only files whose path is in the set are kept.
+   */
+  parseDiff(diffContent: string, allowedPaths?: Set<string>): DiffFile[] {
+    const files: DiffFile[] = [];
+    const lines = diffContent.split('\n');
+
+    let currentFile: DiffFile | null = null;
+    let lineNumber = 0;
+    let inHunk = false;
+    let skipCurrentFile = false;
+
+    for (const line of lines) {
+      if (line.startsWith('diff --git')) {
+        if (currentFile && !skipCurrentFile) {
+          files.push(currentFile);
+        }
+        currentFile = { path: '', changes: [] };
+        inHunk = false;
+        skipCurrentFile = false;
+        continue;
+      }
+
+      if (!currentFile) {
+        continue;
+      }
+
+      // The file header always precedes the first hunk. Inside a hunk, a line
+      // starting with `+++ b/` is an added line whose text begins with
+      // `++ b/`, so it must not be mistaken for a header.
+      if (!inHunk && line.startsWith('+++ b/')) {
+        currentFile.path = line.substring(6);
+        if (allowedPaths && !allowedPaths.has(currentFile.path)) {
+          skipCurrentFile = true;
+        }
+        continue;
+      }
+
+      // Ignore everything else that belongs to a skipped file.
+      if (skipCurrentFile) {
+        continue;
+      }
+
+      if (line.startsWith('@@')) {
+        const match = line.match(/@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/);
+        if (match && match[1]) {
+          // Start one below the hunk's first new-file line: the counter is
+          // incremented before each recorded line.
+          lineNumber = Number.parseInt(match[1], 10) - 1;
+          inHunk = true;
+        }
+        continue;
+      }
+
+      if (!inHunk) {
+        continue;
+      }
+
+      if (line.startsWith('+')) {
+        lineNumber += 1;
+        currentFile.changes.push({ lineNumber, content: line.slice(1), type: 'add' });
+      } else if (line.startsWith(' ')) {
+        lineNumber += 1;
+        currentFile.changes.push({ lineNumber, content: line.slice(1), type: 'context' });
+      }
+    }
+
+    if (currentFile && !skipCurrentFile) {
+      files.push(currentFile);
+    }
+
+    return files.filter((file) => file.path && file.changes.length > 0);
+  }
+}
diff --git a/src/review/context/local-repo-manager.ts b/src/review/context/local-repo-manager.ts
new file mode 100644
index 0000000..f4fc818
--- /dev/null
+++ b/src/review/context/local-repo-manager.ts
@@ -0,0 +1,217 @@
+import { access, mkdir, rm } from 'node:fs/promises';
+import path from 'node:path';
+import { createHash } from 'node:crypto';
+import { SandboxExec } from './sandbox-exec';
+import { logger } from '../../utils/logger';
+
+/** Locations returned by `prepareWorkspace`. */
+export interface LocalRepoPaths {
+  mirrorPath: string;
+  workspacePath: string;
+}
+
+/** Name of the extra remote that points at a fork's repository. */
+const HEAD_REMOTE = 'head';
+
+/** Stable directory name for a repository: the first 16 hex digits of SHA-256(`owner/repo`). */
+function hashRepo(owner: string, repo: string): string {
+  return createHash('sha256').update(`${owner}/${repo}`).digest('hex').slice(0, 16);
+}
+
+/**
+ * Maintains one bare mirror per repository under `workDir/repos` and one
+ * detached worktree per run under `workDir/workspaces`.
+ */
+export class LocalRepoManager {
+  /** Most recently queued lock for each mirror path. */
+  private mirrorLocks = new Map<string, Promise<void>>();
+
+  /**
+   * @param workDir Root directory that holds the mirrors and workspaces.
+   * @param sandboxExec Runner used for every git invocation.
+   * @param commandTimeoutMs Timeout applied to each git command.
+   * @param giteaToken Optional token sent as an `Authorization` header on network operations.
+   */
+  constructor(
+    private readonly workDir: string,
+    private readonly sandboxExec: SandboxExec,
+    private readonly commandTimeoutMs: number,
+    private readonly giteaToken?: string
+  ) {}
+
+  /**
+   * Builds the per-invocation `-c http.extraHeader=...` arguments.
+   *
+   * Passing the header with `-c` keeps the token out of the repository's
+   * stored git config. Returns an empty list when no token is configured.
+   */
+  private getAuthArgs(): string[] {
+    if (!this.giteaToken) {
+      return [];
+    }
+    return ['-c', `http.extraHeader=Authorization: token ${this.giteaToken}`];
+  }
+
+  /** Runs git with the configured timeout; `cwd` defaults to `workDir`. */
+  private runGit(args: string[], cwd: string = this.workDir): ReturnType<SandboxExec['run']> {
+    return this.sandboxExec.run('git', args, { cwd, timeoutMs: this.commandTimeoutMs });
+  }
+
+  /**
+   * Acquires the mutex of a mirror so that only one caller at a time changes
+   * it. Callers queue up in call order.
+   *
+   * @returns The unlock function; the caller must invoke it when done.
+   */
+  private async acquireMirrorLock(mirrorPath: string): Promise<() => void> {
+    // The previous holder's lock (if any) is what this caller waits on.
+    const previousLock = this.mirrorLocks.get(mirrorPath) ?? Promise.resolve();
+
+    let releaseLock: () => void = () => undefined;
+    const ownLock = new Promise<void>((resolve) => {
+      releaseLock = resolve;
+    });
+
+    // Publish this caller's lock before waiting so that later callers queue
+    // behind it. The promise itself is stored, which is what makes the
+    // identity check in the unlock function work.
+    this.mirrorLocks.set(mirrorPath, ownLock);
+
+    await previousLock;
+
+    return () => {
+      releaseLock();
+      // Nobody queued behind this caller: drop the entry so the map does not
+      // keep entries for idle mirrors.
+      if (this.mirrorLocks.get(mirrorPath) === ownLock) {
+        this.mirrorLocks.delete(mirrorPath);
+      }
+    };
+  }
+
+  /**
+   * Brings the repository's mirror up to date and checks `targetSha` out
+   * into a fresh detached worktree for the run.
+   *
+   * All mirror-mutating steps (clone/fetch, remote changes, worktree
+   * prune/add) run while holding the mirror lock.
+   *
+   * @param cloneUrl URL of the base repository (`origin`).
+   * @param targetSha Commit to check out.
+   * @param runId Used as the workspace directory name.
+   * @param headCloneUrl URL of the fork for fork pull requests; ignored when equal to `cloneUrl`.
+   */
+  async prepareWorkspace(
+    owner: string,
+    repo: string,
+    cloneUrl: string,
+    targetSha: string,
+    runId: string,
+    headCloneUrl?: string
+  ): Promise<LocalRepoPaths> {
+    const repoHash = hashRepo(owner, repo);
+    const mirrorsRoot = path.join(this.workDir, 'repos');
+    const workspacesRoot = path.join(this.workDir, 'workspaces');
+    const mirrorPath = path.join(mirrorsRoot, `${repoHash}.git`);
+    const workspacePath = path.join(workspacesRoot, runId);
+
+    await mkdir(mirrorsRoot, { recursive: true });
+    await mkdir(workspacesRoot, { recursive: true });
+
+    const unlock = await this.acquireMirrorLock(mirrorPath);
+
+    try {
+      const authArgs = this.getAuthArgs();
+      const inMirror = ['--git-dir', mirrorPath];
+
+      if (!(await this.pathExists(mirrorPath))) {
+        logger.info('创建本地 mirror 仓库', { owner, repo, mirrorPath });
+        await this.runGit([...authArgs, 'clone', '--mirror', cloneUrl, mirrorPath]);
+      } else {
+        // The remote URL is stored as given; authentication is supplied only on the fetch.
+        await this.runGit([...inMirror, 'remote', 'set-url', 'origin', cloneUrl]);
+        await this.runGit([...authArgs, ...inMirror, 'fetch', '--prune', 'origin', '+refs/*:refs/*']);
+      }
+
+      // Fork pull request: fetch the fork's branches through a second remote
+      // so that the head SHA is available in the mirror.
+      if (headCloneUrl && headCloneUrl !== cloneUrl) {
+        logger.info('Fork PR检测,添加head remote', { owner, repo, headCloneUrl });
+
+        // `git remote` prints one name per line. Compare whole names: a
+        // substring test would also match remotes such as `ahead`.
+        const remoteList = await this.runGit([...inMirror, 'remote']);
+        const hasHeadRemote = remoteList.stdout.split('\n').some((name) => name.trim() === HEAD_REMOTE);
+
+        await this.runGit([...inMirror, 'remote', hasHeadRemote ? 'set-url' : 'add', HEAD_REMOTE, headCloneUrl]);
+        await this.runGit([
+          ...authArgs,
+          ...inMirror,
+          'fetch',
+          HEAD_REMOTE,
+          `+refs/heads/*:refs/remotes/${HEAD_REMOTE}/*`,
+        ]);
+      }
+
+      await rm(workspacePath, { recursive: true, force: true });
+
+      // After a crash the directory may be gone while the worktree is still
+      // registered in the mirror; prune removes such stale entries.
+      await this.runGit([...inMirror, 'worktree', 'prune']);
+      await this.runGit([...inMirror, 'worktree', 'add', '--detach', workspacePath, targetSha]);
+    } finally {
+      // Release only after every mirror-mutating step has finished.
+      unlock();
+    }
+
+    return {
+      mirrorPath,
+      workspacePath,
+    };
+  }
+
+  /**
+   * Removes the run's worktree. If `git worktree remove` fails the
+   * workspace directory is deleted directly.
+   */
+  async cleanupWorkspace(paths: LocalRepoPaths): Promise<void> {
+    // `worktree remove` also changes mirror metadata, so it takes the same lock.
+    const unlock = await this.acquireMirrorLock(paths.mirrorPath);
+
+    try {
+      await this.runGit(['--git-dir', paths.mirrorPath, 'worktree', 'remove', '--force', paths.workspacePath]);
+    } catch (error) {
+      logger.warn('移除 git worktree 失败,尝试直接清理目录', {
+        workspacePath: paths.workspacePath,
+        error: error instanceof Error ? error.message : String(error),
+      });
+      await rm(paths.workspacePath, { recursive: true, force: true });
+    } finally {
+      unlock();
+    }
+  }
+
+  /**
+   * Resolves the first parent of `commitSha`.
+   *
+   * @returns The parent SHA, or `null` when `git rev-parse` fails or prints nothing.
+   */
+  async resolveCommitParent(workspacePath: string, commitSha: string): Promise<string | null> {
+    try {
+      const result = await this.runGit(['rev-parse', `${commitSha}^`], workspacePath);
+      return result.stdout.trim() || null;
+    } catch {
+      return null;
+    }
+  }
+
+  /** Returns whether `targetPath` can be accessed. */
+  private async pathExists(targetPath: string): Promise<boolean> {
+    try {
+      await access(targetPath);
+      return true;
+    } catch {
+      return false;
+    }
+  }
+}
diff --git a/src/review/context/sandbox-exec.ts b/src/review/context/sandbox-exec.ts
new file mode 100644
index 0000000..e9e1002
--- /dev/null
+++ b/src/review/context/sandbox-exec.ts
@@ -0,0 +1,123 @@
+import { execFile } from 'node:child_process';
+
+/** Outcome of a successfully completed command. */
+export interface SandboxCommandResult {
+  stdout: string;
+  stderr: string;
+  durationMs: number;
+  exitCode: number;
+}
+
+/** Per-invocation settings for `SandboxExec.run`. */
+export interface SandboxRunOptions {
+  cwd: string;
+  timeoutMs: number;
+}
+
+/**
+ * Runs allow-listed commands without a shell and with a reduced environment.
+ */
+export class SandboxExec {
+  private readonly allowedCommands: Set<string>;
+
+  /** @param allowedCommands Names of the commands that `run` may execute. */
+  constructor(allowedCommands: string[]) {
+    this.allowedCommands = new Set(allowedCommands);
+  }
+
+  /**
+   * Masks credentials in command arguments before they are put into an
+   * error message: the token of an `http.extraHeader=Authorization:` config
+   * value and the user/password part of URL arguments.
+   */
+  private redactSensitiveArgs(args: string[]): string[] {
+    return args.map((arg) => {
+      if (arg.includes('http.extraHeader=Authorization:')) {
+        return arg.replace(/(Authorization:\s*token\s+)[^\s]+/i, '$1***');
+      }
+
+      try {
+        const url = new URL(arg);
+        if (url.username || url.password) {
+          url.username = '***';
+          url.password = '***';
+          return url.toString();
+        }
+      } catch {
+        // Not a URL: keep the argument unchanged.
+      }
+      return arg;
+    });
+  }
+
+  /**
+   * Masks credentials in free text such as stderr: `Authorization: token ...`
+   * values and the userinfo part of `scheme://user:pass@host` URLs.
+   */
+  private redactSensitiveText(text: string): string {
+    return text
+      .replace(/(Authorization:\s*token\s+)\S+/gi, '$1***')
+      .replace(/\b([a-z][a-z0-9+.-]*:\/\/)[^\s/@]+@/gi, '$1***:***@');
+  }
+
+  /**
+   * Executes `command` with `args` via `execFile` (no shell).
+   *
+   * The child only receives `PATH`, `HOME`, `LANG` and `LC_ALL` from the
+   * parent environment, and its output is capped at 16 MiB per stream.
+   *
+   * @returns stdout, stderr and duration; `exitCode` is always 0 on success.
+   * @throws Error when the command is not allow-listed, or when it fails
+   *   (non-zero exit, timeout, spawn error). The message contains the
+   *   redacted arguments and redacted stderr.
+   */
+  async run(command: string, args: string[], options: SandboxRunOptions): Promise<SandboxCommandResult> {
+    if (!this.allowedCommands.has(command)) {
+      throw new Error(`命令未在白名单中: ${command}`);
+    }
+
+    const startedAt = Date.now();
+
+    return new Promise<SandboxCommandResult>((resolve, reject) => {
+      execFile(
+        command,
+        args,
+        {
+          cwd: options.cwd,
+          timeout: options.timeoutMs,
+          maxBuffer: 1024 * 1024 * 16,
+          windowsHide: true,
+          env: {
+            PATH: process.env.PATH,
+            HOME: process.env.HOME,
+            LANG: process.env.LANG,
+            LC_ALL: process.env.LC_ALL,
+          },
+        },
+        (error, stdout, stderr) => {
+          const durationMs = Date.now() - startedAt;
+          if (error) {
+            const code = typeof error.code === 'number' ? error.code : -1;
+            // Neither the arguments nor stderr may leak credentials into logs
+            // or persisted error records, so both are redacted.
+            const redactedArgs = this.redactSensitiveArgs(args);
+            const redactedStderr = this.redactSensitiveText(stderr);
+            reject(
+              new Error(
+                `命令执行失败: ${command} ${redactedArgs.join(' ')}; code=${code}; stderr=${redactedStderr || '(无 stderr,原始错误已脱敏)'}`
+              )
+            );
+            return;
+          }
+
+          resolve({
+            stdout,
+            stderr,
+            durationMs,
+            exitCode: 0,
+          });
+        }
+      );
+    });
+  }
+}