From 956a84acc11beffb0d64abd4098e6e319ec51ea7 Mon Sep 17 00:00:00 2001
From: accelerator
Date: Sun, 1 Mar 2026 03:28:34 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=90=91=E9=87=8F?=
 =?UTF-8?q?=E8=AE=B0=E5=BF=86=E5=92=8C=E5=AD=A6=E4=B9=A0=E7=B3=BB=E7=BB=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

基于Qdrant的向量存储实现finding相似度搜索;LearningSystem支持误报学习和Few-shot示例生成

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)
Co-authored-by: Sisyphus
---
 src/review/learning/learning-system.ts | 257 ++++++++++++++++++++++++
 src/review/memory/types.ts             |  32 +++
 src/review/memory/vector-store.ts      | 306 +++++++++++++++++++++++++++++
 3 files changed, 595 insertions(+)
 create mode 100644 src/review/learning/learning-system.ts
 create mode 100644 src/review/memory/types.ts
 create mode 100644 src/review/memory/vector-store.ts

diff --git a/src/review/learning/learning-system.ts b/src/review/learning/learning-system.ts
new file mode 100644
index 0000000..3df065b
--- /dev/null
+++ b/src/review/learning/learning-system.ts
@@ -0,0 +1,257 @@
+import { VectorMemoryStore } from '../memory/vector-store';
+import { FileReviewStore } from '../store/file-review-store';
+import { Finding, FindingCategory } from '../types';
+import { logger } from '../../utils/logger';
+import OpenAI from 'openai';
+import config from '../../config';
+
+/** One payload-field condition placed in the `must` list of a search filter. */
+interface FieldCondition {
+  key: string;
+  match: { value: string | boolean };
+}
+
+/**
+ * Turns reviewer feedback into stored memories, and uses those memories to
+ * build few-shot prompts and confidence adjustments.
+ */
+export class LearningSystem {
+  constructor(
+    private memoryStore: VectorMemoryStore,
+    private store: FileReviewStore
+  ) {}
+
+  /**
+   * Records a rejected finding as a false-positive pattern and lowers the
+   * confidence of similar findings that have not been published yet.
+   *
+   * @param finding - The finding that was judged to be a false positive.
+   * @param reason - Why the finding was rejected.
+   * @param owner - Repository owner.
+   * @param repo - Repository name.
+   */
+  async learnFromFalsePositive(
+    finding: Finding,
+    reason: string,
+    owner: string,
+    repo: string
+  ): Promise<void> {
+    // Store the false-positive pattern in vector memory.
+    await this.memoryStore.storeMemory({
+      id: `fp-${finding.id}`,
+      type: 'pattern',
+      content: `False Positive: ${finding.title}\nReason: ${reason}\nEvidence: ${finding.evidence}\nCategory: ${finding.category}`,
+      metadata: {
+        category: finding.category,
+        approved: false,
+        timestamp: new Date().toISOString(),
+        owner,
+        repo,
+        project: `${owner}/${repo}`,
+      },
+    });
+
+    // Find similar unpublished findings and lower their confidence.
+    const similarFindings = await this.findSimilarPendingFindings(finding);
+    let updatedCount = 0;
+
+    for (const similar of similarFindings) {
+      if (!similar.published && similar.confidence > 0.5) {
+        // Lower by 0.2, but never below 0.3.
+        const newConfidence = Math.max(similar.confidence - 0.2, 0.3);
+        await this.store.updateFindingConfidence(similar.id, newConfidence);
+        updatedCount += 1;
+
+        logger.info('从误报中学习,降低相似finding置信度', {
+          findingId: similar.id,
+          oldConfidence: similar.confidence,
+          newConfidence,
+        });
+      }
+    }
+
+    logger.info('从误报中学习完成', {
+      findingId: finding.id,
+      category: finding.category,
+      // Count only the findings whose confidence was actually changed.
+      updatedSimilar: updatedCount,
+    });
+  }
+
+  /**
+   * Builds few-shot chat messages for a category from stored memories:
+   * approved entries become positive examples and recorded false positives
+   * become negative examples.
+   *
+   * @param category - Finding category to build examples for.
+   * @param owner - Optional repository owner; together with `repo` it scopes the search to that project first.
+   * @param repo - Optional repository name.
+   * @returns User/assistant message pairs; empty when the configured count is zero or negative.
+   */
+  async generateFewShotExamples(
+    category: FindingCategory,
+    owner?: string,
+    repo?: string
+  ): Promise<OpenAI.Chat.ChatCompletionMessageParam[]> {
+    const targetCount = config.review.fewShotExamplesCount;
+
+    // Few-shot disabled (or a negative count): return before any vector query.
+    if (targetCount <= 0) {
+      return [];
+    }
+
+    const categoryCondition: FieldCondition = { key: 'category', match: { value: category } };
+    const approvedCondition: FieldCondition = { key: 'approved', match: { value: true } };
+    const rejectedCondition: FieldCondition = { key: 'approved', match: { value: false } };
+
+    // When a project is given, prefer examples from that project.
+    const project = owner && repo ? `${owner}/${repo}` : undefined;
+    const scope: FieldCondition[] = project
+      ? [categoryCondition, { key: 'project', match: { value: project } }]
+      : [categoryCondition];
+
+    // Query with the category name rather than an empty string so the embedding call is meaningful.
+    const categoryQuery = `${category} issues in code`;
+
+    // Approved entries are the positive samples.
+    const approved = await this.memoryStore.searchSimilar(categoryQuery, 10, {
+      must: [...scope, approvedCondition],
+    });
+
+    // Recorded false positives are the negative samples.
+    const rejected = await this.memoryStore.searchSimilar(categoryQuery, 5, {
+      must: [...scope, rejectedCondition],
+    });
+
+    // Top up with global examples when the project has too few. Without a
+    // project scope the first query was already global, so repeating it would
+    // only cost another embedding call.
+    if (project && approved.length < targetCount) {
+      const seenIds = new Set(approved.map((a) => a.entry.id));
+      const globalApproved = await this.memoryStore.searchSimilar(categoryQuery, 10, {
+        must: [categoryCondition, approvedCondition],
+      });
+      approved.push(...globalApproved.filter((a) => !seenIds.has(a.entry.id)));
+    }
+
+    const positives = approved.slice(0, targetCount);
+    // Negative examples are capped at 40% of the target count.
+    const negatives = rejected.slice(0, Math.floor(targetCount * 0.4));
+
+    const examples: OpenAI.Chat.ChatCompletionMessageParam[] = [];
+
+    // Positive examples.
+    for (const a of positives) {
+      examples.push({
+        role: 'user',
+        content: `审查这段代码变更,关注${category}相关问题:\n${a.entry.content}`,
+      });
+      examples.push({
+        role: 'assistant',
+        content: JSON.stringify({
+          findings: [
+            {
+              title: (a.entry.content.split('\n')[0] ?? '').replace('False Positive: ', ''),
+              category,
+              severity: a.entry.metadata.severity || 'medium',
+              valid: true,
+            },
+          ],
+        }),
+      });
+    }
+
+    // Negative examples (false positives).
+    for (const r of negatives) {
+      examples.push({
+        role: 'user',
+        content: `审查这段代码变更,关注${category}相关问题:\n${r.entry.content}`,
+      });
+      examples.push({
+        role: 'assistant',
+        content: JSON.stringify({
+          findings: [],
+          reason: '历史反馈表明这类情况不应报告为问题',
+        }),
+      });
+    }
+
+    logger.debug('生成Few-shot示例', {
+      category,
+      // Report the examples actually emitted, not every search hit.
+      positiveExamples: positives.length,
+      negativeExamples: negatives.length,
+      totalMessages: examples.length,
+    });
+
+    return examples;
+  }
+
+  /**
+   * Simplified placeholder: FileReviewStore has no similarity query yet, so
+   * this always returns an empty list. FileReviewStore must be extended
+   * before this can return real results.
+   */
+  private async findSimilarPendingFindings(_finding: Finding): Promise<Finding[]> {
+    return [];
+  }
+
+  /**
+   * Stores an approved finding as a positive sample.
+   *
+   * @param finding - The approved finding.
+   * @param owner - Repository owner.
+   * @param repo - Repository name.
+   */
+  async learnFromApproval(finding: Finding, owner: string, repo: string): Promise<void> {
+    await this.memoryStore.storeFinding(finding, true, owner, repo);
+
+    logger.info('从批准中学习完成', {
+      findingId: finding.id,
+      category: finding.category,
+      severity: finding.severity,
+    });
+  }
+
+  /**
+   * Computes a confidence penalty for a candidate finding from its similarity
+   * to false positives recorded for the same project and category.
+   *
+   * @param finding - Candidate finding; only title, evidence and category are read.
+   * @param owner - Repository owner.
+   * @param repo - Repository name.
+   * @returns 0, -0.05, -0.15 or -0.3 depending on the highest similarity score.
+   */
+  async getConfidenceAdjustment(
+    finding: Pick<Finding, 'title' | 'evidence' | 'category'>,
+    owner: string,
+    repo: string
+  ): Promise<number> {
+    // Search similar false positives within the same project.
+    const query = `${finding.title}\n${finding.evidence}`;
+    const similarFalsePositives = await this.memoryStore.searchSimilar(query, 3, {
+      must: [
+        { key: 'type', match: { value: 'pattern' } },
+        { key: 'category', match: { value: finding.category } },
+        { key: 'project', match: { value: `${owner}/${repo}` } },
+      ],
+    });
+
+    if (similarFalsePositives.length === 0) {
+      return 0; // Nothing to adjust.
+    }
+
+    // The penalty is driven by the single most similar false positive.
+    const maxSimilarity = Math.max(...similarFalsePositives.map((fp) => fp.score));
+
+    if (maxSimilarity > 0.9) {
+      return -0.3; // Highly similar: large penalty.
+    } else if (maxSimilarity > 0.8) {
+      return -0.15; // Moderately similar: medium penalty.
+    } else if (maxSimilarity > 0.7) {
+      return -0.05; // Weakly similar: small penalty.
+    }
+
+    return 0;
+  }
+}
diff --git a/src/review/memory/types.ts b/src/review/memory/types.ts
new file mode 100644
index 0000000..45439a9
--- /dev/null
+++ b/src/review/memory/types.ts
@@ -0,0 +1,32 @@
+/** A single record kept in the vector memory collection. */
+export interface MemoryEntry {
+  id: string;
+  type: 'finding' | 'feedback' | 'pattern';
+  content: string;
+  embedding?: number[];
+  metadata: {
+    category?: string;
+    severity?: string;
+    approved?: boolean;
+    timestamp: string;
+    project?: string;
+    owner?: string;
+    repo?: string;
+  };
+}
+
+/** A search hit: the stored entry plus its similarity score. */
+export interface MemorySearchResult {
+  entry: MemoryEntry;
+  score: number;
+  distance: number;
+}
+
+/** Reviewer feedback on a finding. */
+export interface FeedbackRecord {
+  findingId: string;
+  approved: boolean;
+  reason: string;
+  timestamp: string;
+  reviewer?: string;
+}
diff --git a/src/review/memory/vector-store.ts b/src/review/memory/vector-store.ts
new file mode 100644
index 0000000..1a9d41f
--- /dev/null
+++ b/src/review/memory/vector-store.ts
@@ -0,0 +1,306 @@
+import { createHash } from 'node:crypto';
+import { QdrantClient } from '@qdrant/js-client-rest';
+import OpenAI from 'openai';
+import { MemoryEntry, MemorySearchResult } from './types';
+import { Finding } from '../types';
+import { logger } from '../../utils/logger';
+
+const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+
+/**
+ * Maps an arbitrary memory id to a Qdrant point id. Qdrant only accepts
+ * unsigned integers or UUIDs, so other strings are hashed into a
+ * deterministic UUID-shaped value; the same input always yields the same
+ * point id, which keeps upserts idempotent.
+ */
+function toPointId(memoryId: string): string {
+  if (UUID_PATTERN.test(memoryId)) {
+    return memoryId;
+  }
+  const hex = createHash('sha256').update(memoryId).digest('hex');
+  // Set the version nibble to 8 (custom) and the variant bits to 10xx.
+  const variant = ((parseInt(hex.charAt(16), 16) & 0x3) | 0x8).toString(16);
+  return [
+    hex.slice(0, 8),
+    hex.slice(8, 12),
+    `8${hex.slice(13, 16)}`,
+    `${variant}${hex.slice(17, 20)}`,
+    hex.slice(20, 32),
+  ].join('-');
+}
+
+/**
+ * Vector-backed memory for code-review findings, stored in Qdrant and
+ * embedded with OpenAI `text-embedding-3-small`.
+ */
+export class VectorMemoryStore {
+  private client: QdrantClient;
+  private openai: OpenAI;
+  private collectionName = 'code_review_memory';
+  private initialized = false;
+
+  /**
+   * @param qdrantUrl - URL of the Qdrant server.
+   * @param openaiClient - OpenAI client used to create embeddings.
+   */
+  constructor(qdrantUrl: string, openaiClient: OpenAI) {
+    this.client = new QdrantClient({ url: qdrantUrl });
+    this.openai = openaiClient;
+  }
+
+  /**
+   * Creates the collection if it does not exist yet. Calls after the first
+   * successful one return immediately.
+   *
+   * @throws Rethrows any Qdrant client error after logging it.
+   */
+  async initialize(): Promise<void> {
+    if (this.initialized) {
+      return;
+    }
+
+    try {
+      const collections = await this.client.getCollections();
+      const exists = collections.collections.some((c) => c.name === this.collectionName);
+
+      if (!exists) {
+        await this.client.createCollection(this.collectionName, {
+          vectors: {
+            size: 1536, // text-embedding-3-small dimension
+            distance: 'Cosine',
+          },
+        });
+        logger.info('向量记忆集合已创建', { collection: this.collectionName });
+      }
+
+      this.initialized = true;
+      logger.info('向量记忆系统已初始化');
+    } catch (error) {
+      logger.error('向量记忆系统初始化失败', {
+        error: error instanceof Error ? error.message : String(error),
+      });
+      throw error;
+    }
+  }
+
+  /**
+   * Embeds the entry content and upserts it into the collection.
+   *
+   * @param entry - Memory to store; `entry.id` may be any string.
+   */
+  async storeMemory(entry: MemoryEntry): Promise<void> {
+    await this.initialize();
+
+    const embedding = await this.getEmbedding(entry.content);
+
+    await this.client.upsert(this.collectionName, {
+      points: [
+        {
+          // Qdrant rejects point ids that are neither unsigned integers nor UUIDs.
+          id: toPointId(entry.id),
+          vector: embedding,
+          payload: {
+            // Keep the caller-facing id so searches can return it.
+            memoryId: entry.id,
+            type: entry.type,
+            content: entry.content,
+            ...entry.metadata,
+          },
+        },
+      ],
+    });
+
+    logger.debug('记忆已存储', {
+      id: entry.id,
+      type: entry.type,
+      category: entry.metadata.category,
+    });
+  }
+
+  /**
+   * Finds the stored memories most similar to a query string.
+   *
+   * @param query - Text to embed and search with.
+   * @param limit - Maximum number of hits (default 5).
+   * @param filter - Optional Qdrant payload filter, passed through unchanged.
+   * @returns Hits with `score` from Qdrant and `distance` computed as `1 - score`.
+   */
+  async searchSimilar(
+    query: string,
+    limit: number = 5,
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- forwarded as-is to the Qdrant client
+    filter?: any
+  ): Promise<MemorySearchResult[]> {
+    await this.initialize();
+
+    const queryEmbedding = await this.getEmbedding(query);
+
+    const results = await this.client.search(this.collectionName, {
+      vector: queryEmbedding,
+      limit,
+      filter,
+    });
+
+    return results.map((r) => {
+      const memoryId = r.payload?.memoryId;
+      return {
+        entry: {
+          // Prefer the id the caller stored; fall back to the raw point id.
+          id: typeof memoryId === 'string' ? memoryId : String(r.id),
+          type: r.payload?.type as MemoryEntry['type'],
+          content: r.payload?.content as string,
+          metadata: {
+            category: r.payload?.category as string,
+            severity: r.payload?.severity as string,
+            approved: r.payload?.approved as boolean,
+            timestamp: r.payload?.timestamp as string,
+            project: r.payload?.project as string,
+            owner: r.payload?.owner as string,
+            repo: r.payload?.repo as string,
+          },
+        },
+        score: r.score,
+        distance: 1 - r.score,
+      };
+    });
+  }
+
+  /**
+   * Creates an embedding for the given text with `text-embedding-3-small`.
+   *
+   * @throws Rethrows any OpenAI client error after logging it; also throws when the response has no embedding.
+   */
+  private async getEmbedding(text: string): Promise<number[]> {
+    try {
+      const response = await this.openai.embeddings.create({
+        model: 'text-embedding-3-small',
+        input: text.slice(0, 8000), // Cap the length to stay under the token limit.
+      });
+
+      const first = response.data[0];
+      if (!first) {
+        throw new Error('Embedding response contained no data');
+      }
+      return first.embedding;
+    } catch (error) {
+      logger.error('生成embedding失败', {
+        error: error instanceof Error ? error.message : String(error),
+      });
+      throw error;
+    }
+  }
+
+  /**
+   * Stores a finding as a memory entry of type `finding`.
+   *
+   * @param finding - Finding to store.
+   * @param approved - Whether the finding was approved.
+   * @param owner - Repository owner.
+   * @param repo - Repository name.
+   */
+  async storeFinding(finding: Finding, approved: boolean, owner: string, repo: string): Promise<void> {
+    const content = `${finding.title}\n${finding.detail}\nEvidence: ${finding.evidence}`;
+
+    // Repo-scoped id so findings from different repositories do not overwrite each other.
+    const scopedId = `${owner}/${repo}:${finding.fingerprint}`;
+
+    await this.storeMemory({
+      id: scopedId,
+      type: 'finding',
+      content,
+      metadata: {
+        category: finding.category,
+        severity: finding.severity,
+        approved,
+        timestamp: new Date().toISOString(),
+        owner,
+        repo,
+        project: `${owner}/${repo}`,
+      },
+    });
+  }
+
+  /**
+   * Builds a prompt snippet listing approved findings similar to the current one.
+   *
+   * @param currentFinding - Finding being reviewed; only title and evidence are read.
+   * @param owner - Repository owner.
+   * @param repo - Repository name.
+   * @returns The snippet, or an empty string when nothing similar is found.
+   */
+  async getHistoricalContext(
+    currentFinding: Partial<Finding>,
+    owner: string,
+    repo: string
+  ): Promise<string> {
+    // A missing title must not be embedded as the literal text "undefined".
+    const query = `${currentFinding.title ?? ''}\n${currentFinding.evidence || ''}`;
+
+    // Search the same project first.
+    const projectSimilar = await this.searchSimilar(query, 2, {
+      must: [
+        { key: 'approved', match: { value: true } },
+        { key: 'project', match: { value: `${owner}/${repo}` } },
+      ],
+    });
+
+    // Fall back to a global search when the project has too few matches.
+    let similar = projectSimilar;
+    if (similar.length < 2) {
+      const globalSimilar = await this.searchSimilar(query, 3, {
+        must: [{ key: 'approved', match: { value: true } }],
+      });
+      // The global search can return this project's entries again; drop repeats.
+      const seenIds = new Set(projectSimilar.map((s) => s.entry.id));
+      similar = [
+        ...projectSimilar,
+        ...globalSimilar.filter((s) => !seenIds.has(s.entry.id)),
+      ].slice(0, 3);
+    }
+
+    if (similar.length === 0) {
+      return '';
+    }
+
+    return `\n\n历史相似问题参考:\n${similar
+      .map(
+        (s, i) =>
+          `${i + 1}. ${s.entry.content.split('\n')[0]} (相似度: ${(s.score * 100).toFixed(1)}%, 项目: ${
+            s.entry.metadata.project || '未知'
+          })`
+      )
+      .join('\n')}`;
+  }
+
+  /**
+   * Stores reviewer feedback as a memory entry of type `feedback`.
+   *
+   * @param findingId - Id of the finding the feedback refers to.
+   * @param approved - Whether the finding was approved.
+   * @param reason - Reviewer's reason.
+   * @param owner - Repository owner.
+   * @param repo - Repository name.
+   */
+  async storeFeedback(
+    findingId: string,
+    approved: boolean,
+    reason: string,
+    owner: string,
+    repo: string
+  ): Promise<void> {
+    const content = `Feedback: ${approved ? 'Approved' : 'Rejected'}\nReason: ${reason}\nFinding ID: ${findingId}`;
+
+    await this.storeMemory({
+      id: `feedback-${findingId}-${Date.now()}`,
+      type: 'feedback',
+      content,
+      metadata: {
+        approved,
+        timestamp: new Date().toISOString(),
+        owner,
+        repo,
+        project: `${owner}/${repo}`,
+      },
+    });
+  }
+}