mirror of
https://github.com/jeffusion/gitea-ai-assistant.git
synced 2026-05-31 23:16:46 +00:00
feat: 添加向量记忆和学习系统
基于Qdrant的向量存储实现finding相似度搜索;LearningSystem支持误报学习和Few-shot示例生成 Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
210
src/review/learning/learning-system.ts
Normal file
210
src/review/learning/learning-system.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
import { VectorMemoryStore } from '../memory/vector-store';
|
||||
import { FileReviewStore } from '../store/file-review-store';
|
||||
import { Finding, FindingCategory } from '../types';
|
||||
import { logger } from '../../utils/logger';
|
||||
import OpenAI from 'openai';
|
||||
import config from '../../config';
|
||||
|
||||
/**
 * Feedback-driven learning layer for the review pipeline.
 *
 * Writes reviewer feedback (approved findings and false positives) into vector
 * memory, and reads that memory back to build few-shot prompt examples and to
 * compute confidence penalties for new findings.
 */
export class LearningSystem {
  constructor(
    private memoryStore: VectorMemoryStore,
    private store: FileReviewStore
  ) {}

  /**
   * Records a finding that was judged to be a false positive.
   *
   * The finding is stored in vector memory as a `pattern` entry with
   * `approved: false`. Any similar, still-unpublished findings whose confidence
   * is above 0.5 then have their confidence lowered by 0.2 (never below 0.3).
   *
   * @param finding The finding that was rejected.
   * @param reason  Reviewer-supplied explanation; embedded in the stored text.
   * @param owner   Repository owner, used to scope the memory to a project.
   * @param repo    Repository name, used to scope the memory to a project.
   */
  async learnFromFalsePositive(
    finding: Finding,
    reason: string,
    owner: string,
    repo: string
  ): Promise<void> {
    // Persist the false-positive pattern in vector memory.
    await this.memoryStore.storeMemory({
      id: `fp-${finding.id}`,
      type: 'pattern',
      content: `False Positive: ${finding.title}\nReason: ${reason}\nEvidence: ${finding.evidence}\nCategory: ${finding.category}`,
      metadata: {
        category: finding.category,
        approved: false,
        timestamp: new Date().toISOString(),
        owner,
        repo,
        project: `${owner}/${repo}`,
      },
    });

    // Look up similar findings that have not been published yet and lower their confidence.
    const similarFindings = await this.findSimilarPendingFindings(finding);

    // Count only the findings that were actually modified so the summary log is accurate.
    let updatedCount = 0;

    for (const similar of similarFindings) {
      if (!similar.published && similar.confidence > 0.5) {
        const newConfidence = Math.max(similar.confidence - 0.2, 0.3);
        await this.store.updateFindingConfidence(similar.id, newConfidence);
        updatedCount += 1;

        logger.info('从误报中学习,降低相似finding置信度', {
          findingId: similar.id,
          oldConfidence: similar.confidence,
          newConfidence,
        });
      }
    }

    logger.info('从误报中学习完成', {
      findingId: finding.id,
      category: finding.category,
      updatedSimilar: updatedCount,
    });
  }

  /**
   * Builds few-shot chat messages for a finding category from stored memory.
   *
   * Positive examples come from entries with `approved: true`, negative ones
   * from entries with `approved: false`. When `owner` and `repo` are both given,
   * project-scoped entries are used first and topped up with entries from any
   * project if there are fewer than the configured count. Each example is a
   * `user` message followed by an `assistant` message.
   *
   * @param category Finding category to generate examples for.
   * @param owner    Optional repository owner for project scoping.
   * @param repo     Optional repository name for project scoping.
   * @returns Alternating user/assistant messages; empty when few-shot is disabled.
   */
  async generateFewShotExamples(
    category: FindingCategory,
    owner?: string,
    repo?: string
  ): Promise<OpenAI.Chat.ChatCompletionMessageParam[]> {
    const targetCount = config.review.fewShotExamplesCount;

    // Bail out before any vector query when few-shot is disabled. A zero,
    // negative or non-numeric count all mean "no examples".
    if (!Number.isFinite(targetCount) || targetCount <= 0) {
      return [];
    }

    // At most 40% of the target count is spent on negative examples.
    const negativeCount = Math.floor(targetCount * 0.4);

    // Ask for at least as many candidates as we may need to emit.
    const positiveLimit = Math.max(10, targetCount);
    const negativeLimit = Math.max(5, negativeCount);

    // Filter building blocks.
    const categoryCondition = { key: 'category', match: { value: category } };
    const approvedCondition = { key: 'approved', match: { value: true } };
    const rejectedCondition = { key: 'approved', match: { value: false } };

    // When a project is specified, prefer examples from that project.
    const project = owner && repo ? `${owner}/${repo}` : undefined;
    const scope = [
      categoryCondition,
      ...(project ? [{ key: 'project', match: { value: project } }] : []),
    ];

    // Use a category-based query rather than an empty string so the embedding is meaningful.
    const categoryQuery = `${category} issues in code`;

    // Approved entries become positive examples.
    const approved = await this.memoryStore.searchSimilar(categoryQuery, positiveLimit, {
      must: [...scope, approvedCondition],
    });

    // Rejected entries (false positives) become negative examples.
    const rejected = await this.memoryStore.searchSimilar(categoryQuery, negativeLimit, {
      must: [...scope, rejectedCondition],
    });

    // Top up with global examples only when the first query was project-scoped;
    // otherwise the global query would repeat the first one verbatim.
    if (project && approved.length < targetCount) {
      const globalApproved = await this.memoryStore.searchSimilar(categoryQuery, positiveLimit, {
        must: [categoryCondition, approvedCondition],
      });
      const knownIds = new Set(approved.map((hit) => hit.entry.id));
      for (const candidate of globalApproved) {
        if (!knownIds.has(candidate.entry.id)) {
          knownIds.add(candidate.entry.id);
          approved.push(candidate);
        }
      }
    }

    const positives = approved.slice(0, targetCount);
    const negatives = rejected.slice(0, negativeCount);
    const examples: OpenAI.Chat.ChatCompletionMessageParam[] = [];

    // Positive examples: the assistant reports one valid finding.
    for (const a of positives) {
      examples.push({
        role: 'user',
        content: `审查这段代码变更,关注${category}相关问题:\n${a.entry.content}`,
      });
      examples.push({
        role: 'assistant',
        content: JSON.stringify({
          findings: [
            {
              // The first content line is the title; strip the false-positive prefix if present.
              title: (a.entry.content.split('\n')[0] ?? '').replace('False Positive: ', ''),
              category,
              severity: a.entry.metadata.severity || 'medium',
              valid: true,
            },
          ],
        }),
      });
    }

    // Negative examples (false positives): the assistant reports nothing.
    for (const r of negatives) {
      examples.push({
        role: 'user',
        content: `审查这段代码变更,关注${category}相关问题:\n${r.entry.content}`,
      });
      examples.push({
        role: 'assistant',
        content: JSON.stringify({
          findings: [],
          reason: '历史反馈表明这类情况不应报告为问题',
        }),
      });
    }

    logger.debug('生成Few-shot示例', {
      category,
      positiveExamples: positives.length,
      negativeExamples: negatives.length,
      totalMessages: examples.length,
    });

    return examples;
  }

  /**
   * Finds unpublished findings similar to the given one.
   *
   * Placeholder: FileReviewStore has no such query yet, so this always returns
   * an empty array. FileReviewStore needs to be extended before real use.
   */
  private async findSimilarPendingFindings(_finding: Finding): Promise<Finding[]> {
    return [];
  }

  /**
   * Stores an approved finding in vector memory as a positive sample.
   *
   * @param finding The approved finding.
   * @param _owner  Repository owner the finding belongs to.
   * @param _repo   Repository name the finding belongs to.
   */
  async learnFromApproval(
    finding: Finding,
    _owner: string,
    _repo: string
  ): Promise<void> {
    await this.memoryStore.storeFinding(finding, true, _owner, _repo);

    logger.info('从批准中学习完成', {
      findingId: finding.id,
      category: finding.category,
      severity: finding.severity,
    });
  }

  /**
   * Computes a confidence penalty for a finding from remembered false positives.
   *
   * Searches `pattern` entries with the same category in the same project
   * (only that project; there is no global fallback) and maps the best
   * similarity score to a penalty.
   *
   * @param finding The candidate finding (title and evidence form the query).
   * @param owner   Repository owner.
   * @param repo    Repository name.
   * @returns `-0.3` for score > 0.9, `-0.15` for > 0.8, `-0.05` for > 0.7, otherwise `0`.
   */
  async getConfidenceAdjustment(
    finding: Omit<Finding, 'id' | 'runId' | 'published'>,
    owner: string,
    repo: string
  ): Promise<number> {
    const query = `${finding.title}\n${finding.evidence}`;
    const similarFalsePositives = await this.memoryStore.searchSimilar(query, 3, {
      must: [
        { key: 'type', match: { value: 'pattern' } },
        { key: 'category', match: { value: finding.category } },
        { key: 'project', match: { value: `${owner}/${repo}` } },
      ],
    });

    if (similarFalsePositives.length === 0) {
      return 0; // Nothing remembered, no adjustment.
    }

    // The penalty is driven by the single closest false positive.
    const maxSimilarity = Math.max(...similarFalsePositives.map((fp) => fp.score));

    if (maxSimilarity > 0.9) {
      return -0.3; // Near-duplicate of a known false positive.
    }
    if (maxSimilarity > 0.8) {
      return -0.15; // Moderately similar.
    }
    if (maxSimilarity > 0.7) {
      return -0.05; // Loosely similar.
    }

    return 0;
  }
}
|
||||
29
src/review/memory/types.ts
Normal file
29
src/review/memory/types.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
/** Kind of record held in vector memory. */
type MemoryEntryType = 'finding' | 'feedback' | 'pattern';

/**
 * Attributes stored next to a memory's text. Only `timestamp` is required;
 * the other fields are filled in depending on what produced the entry.
 */
interface MemoryEntryMetadata {
  /** Finding category the entry relates to. */
  category?: string;
  /** Severity of the related finding. */
  severity?: string;
  /** Whether the related finding was approved (`false` marks a rejection / false positive). */
  approved?: boolean;
  /** Creation time; the visible writers use `new Date().toISOString()`. */
  timestamp: string;
  /** Project key; the visible writers use `owner/repo`. */
  project?: string;
  /** Repository owner. */
  owner?: string;
  /** Repository name. */
  repo?: string;
}

/** A single record stored in (or read back from) vector memory. */
export interface MemoryEntry {
  /** Logical identifier of the entry. */
  id: string;
  /** What the entry represents. */
  type: MemoryEntryType;
  /** Text that is embedded and searched. */
  content: string;
  /** Optional embedding vector. NOTE(review): not populated by the store code visible here. */
  embedding?: number[];
  /** Descriptive attributes used for filtering. */
  metadata: MemoryEntryMetadata;
}
|
||||
|
||||
/** One hit returned by a similarity search over vector memory. */
export interface MemorySearchResult {
  /** The stored entry that matched. */
  entry: MemoryEntry;
  /** Similarity score reported by the search backend. */
  score: number;
  /** Distance derived from the score (the visible store computes `1 - score`). */
  distance: number;
}
|
||||
|
||||
/**
 * Reviewer verdict on a single finding.
 * NOTE(review): not referenced by the code visible alongside this type; confirm its consumers.
 */
export interface FeedbackRecord {
  /** Identifier of the finding the feedback is about. */
  findingId: string;
  /** `true` if the finding was accepted, `false` if it was rejected. */
  approved: boolean;
  /** Free-text justification for the verdict. */
  reason: string;
  /** When the feedback was given. */
  timestamp: string;
  /** Who gave the feedback, if known. */
  reviewer?: string;
}
|
||||
206
src/review/memory/vector-store.ts
Normal file
206
src/review/memory/vector-store.ts
Normal file
@@ -0,0 +1,206 @@
|
||||
import { createHash } from 'node:crypto';

import { QdrantClient } from '@qdrant/js-client-rest';
import OpenAI from 'openai';

import { MemoryEntry, MemorySearchResult } from './types';
import { Finding } from '../types';
import { logger } from '../../utils/logger';
|
||||
|
||||
/**
 * Vector memory backed by a Qdrant collection, with embeddings from OpenAI.
 *
 * Entries are addressed by a caller-chosen logical string ID. Qdrant only
 * accepts unsigned integers or UUIDs as point IDs, so each logical ID is
 * mapped to a deterministic UUID for storage and kept in the payload
 * (`memoryId`) so searches can return it unchanged.
 */
export class VectorMemoryStore {
  private client: QdrantClient;
  private openai: OpenAI;
  private collectionName = 'code_review_memory';
  private initialized = false;
  /** In-flight initialization, shared so concurrent callers do not each try to create the collection. */
  private initializing: Promise<void> | null = null;

  /**
   * @param qdrantUrl    Base URL of the Qdrant server.
   * @param openaiClient OpenAI client used to create embeddings.
   */
  constructor(qdrantUrl: string, openaiClient: OpenAI) {
    this.client = new QdrantClient({ url: qdrantUrl });
    this.openai = openaiClient;
  }

  /**
   * Ensures the backing collection exists. Safe to call repeatedly and
   * concurrently; after a failure the next call retries.
   *
   * @throws Re-throws any error from the Qdrant client after logging it.
   */
  async initialize(): Promise<void> {
    if (this.initialized) {
      return;
    }

    if (!this.initializing) {
      // Clear the shared promise once it settles so a failed attempt can be retried.
      this.initializing = this.ensureCollection().finally(() => {
        this.initializing = null;
      });
    }

    await this.initializing;
  }

  /** Creates the collection if it is missing and marks the store as initialized. */
  private async ensureCollection(): Promise<void> {
    try {
      const collections = await this.client.getCollections();
      const exists = collections.collections.some((c) => c.name === this.collectionName);

      if (!exists) {
        await this.client.createCollection(this.collectionName, {
          vectors: {
            size: 1536, // text-embedding-3-small dimension
            distance: 'Cosine',
          },
        });
        logger.info('向量记忆集合已创建', { collection: this.collectionName });
      }

      this.initialized = true;
      logger.info('向量记忆系统已初始化');
    } catch (error) {
      logger.error('向量记忆系统初始化失败', {
        error: error instanceof Error ? error.message : String(error),
      });
      throw error;
    }
  }

  /**
   * Maps a logical entry ID to a deterministic, UUID-formatted Qdrant point ID.
   * The same logical ID always yields the same point ID, so upserts overwrite.
   */
  private static toPointId(logicalId: string): string {
    const hex = createHash('sha256').update(logicalId).digest('hex');
    // Stamp version (5, name-based) and RFC 4122 variant bits so the value is a well-formed UUID.
    const variant = ((parseInt(hex.charAt(16), 16) & 0x3) | 0x8).toString(16);
    return [
      hex.slice(0, 8),
      hex.slice(8, 12),
      `5${hex.slice(13, 16)}`,
      `${variant}${hex.slice(17, 20)}`,
      hex.slice(20, 32),
    ].join('-');
  }

  /**
   * Embeds an entry's content and upserts it into the collection.
   *
   * @param entry Entry to store; `entry.id` is the logical ID (any string).
   * @throws Propagates embedding or Qdrant errors.
   */
  async storeMemory(entry: MemoryEntry): Promise<void> {
    await this.initialize();

    const embedding = await this.getEmbedding(entry.content);

    await this.client.upsert(this.collectionName, {
      points: [
        {
          id: VectorMemoryStore.toPointId(entry.id),
          vector: embedding,
          payload: {
            type: entry.type,
            content: entry.content,
            ...entry.metadata,
            // Keep the caller's ID so search results can report it.
            memoryId: entry.id,
          },
        },
      ],
    });

    logger.debug('记忆已存储', {
      id: entry.id,
      type: entry.type,
      category: entry.metadata.category,
    });
  }

  /**
   * Embeds `query` and returns the nearest stored entries.
   *
   * @param query  Text to search for.
   * @param limit  Maximum number of hits (default 5).
   * @param filter Optional Qdrant filter object, passed through unchanged.
   * @returns Hits with their score and `distance = 1 - score`.
   */
  async searchSimilar(
    query: string,
    limit: number = 5,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- forwarded verbatim to the Qdrant client
    filter?: any
  ): Promise<MemorySearchResult[]> {
    await this.initialize();

    const queryEmbedding = await this.getEmbedding(query);

    const results = await this.client.search(this.collectionName, {
      vector: queryEmbedding,
      limit,
      filter,
    });

    return results.map((r) => {
      const storedId = r.payload?.memoryId;
      return {
        entry: {
          // Prefer the logical ID; fall back to the point ID for entries stored without one.
          id: typeof storedId === 'string' ? storedId : String(r.id),
          type: r.payload?.type as any,
          content: r.payload?.content as string,
          metadata: {
            category: r.payload?.category as string,
            severity: r.payload?.severity as string,
            approved: r.payload?.approved as boolean,
            timestamp: r.payload?.timestamp as string,
            project: r.payload?.project as string,
            owner: r.payload?.owner as string,
            repo: r.payload?.repo as string,
          },
        },
        score: r.score,
        distance: 1 - r.score,
      };
    });
  }

  /**
   * Creates an embedding for `text` with `text-embedding-3-small`.
   *
   * @throws Logs and re-throws API errors; also throws if the response has no vector.
   */
  private async getEmbedding(text: string): Promise<number[]> {
    try {
      const response = await this.openai.embeddings.create({
        model: 'text-embedding-3-small',
        input: text.slice(0, 8000), // Cap the length to stay under the token limit.
      });

      const embedding = response.data[0]?.embedding;
      if (!embedding) {
        throw new Error('Embedding response contained no vectors');
      }
      return embedding;
    } catch (error) {
      logger.error('生成embedding失败', {
        error: error instanceof Error ? error.message : String(error),
      });
      throw error;
    }
  }

  /**
   * Stores a finding together with its approval status.
   *
   * @param finding  Finding to remember.
   * @param approved Whether reviewers approved it.
   * @param owner    Repository owner.
   * @param repo     Repository name.
   */
  async storeFinding(finding: Finding, approved: boolean, owner: string, repo: string): Promise<void> {
    const content = `${finding.title}\n${finding.detail}\nEvidence: ${finding.evidence}`;

    // Scope the ID by repository so findings from different repos do not overwrite each other.
    const scopedId = `${owner}/${repo}:${finding.fingerprint}`;

    await this.storeMemory({
      id: scopedId,
      type: 'finding',
      content,
      metadata: {
        category: finding.category,
        severity: finding.severity,
        approved,
        timestamp: new Date().toISOString(),
        owner,
        repo,
        project: `${owner}/${repo}`,
      },
    });
  }

  /**
   * Builds a short text block listing previously approved, similar entries.
   *
   * Looks in the given project first; if fewer than two hits are found, adds
   * hits from any project. At most three distinct entries are listed.
   *
   * @param currentFinding Finding being reviewed (title and evidence form the query).
   * @param owner          Repository owner.
   * @param repo           Repository name.
   * @returns The formatted reference block, or `''` when nothing similar exists.
   */
  async getHistoricalContext(
    currentFinding: Partial<Finding>,
    owner: string,
    repo: string
  ): Promise<string> {
    // A missing title must not put the literal text "undefined" into the query.
    const query = `${currentFinding.title ?? ''}\n${currentFinding.evidence || ''}`;

    // Search the same project first.
    const projectSimilar = await this.searchSimilar(query, 2, {
      must: [
        { key: 'approved', match: { value: true } },
        { key: 'project', match: { value: `${owner}/${repo}` } },
      ],
    });

    // Too few project hits: widen to a global search.
    let similar = projectSimilar;
    if (projectSimilar.length < 2) {
      // The global search can return the project hits again, so request extra and de-duplicate.
      const globalSimilar = await this.searchSimilar(query, 3 + projectSimilar.length, {
        must: [{ key: 'approved', match: { value: true } }],
      });

      const seen = new Set<string>();
      similar = [...projectSimilar, ...globalSimilar]
        .filter((hit) => {
          if (seen.has(hit.entry.id)) {
            return false;
          }
          seen.add(hit.entry.id);
          return true;
        })
        .slice(0, 3);
    }

    if (similar.length === 0) {
      return '';
    }

    const lines = similar.map(
      (s, i) =>
        `${i + 1}. ${s.entry.content.split('\n')[0]} (相似度: ${(s.score * 100).toFixed(1)}%, 项目: ${
          s.entry.metadata.project || '未知'
        })`
    );

    return `\n\n历史相似问题参考:\n${lines.join('\n')}`;
  }

  /**
   * Stores a reviewer's verdict on a finding as a `feedback` entry.
   * Each call creates a new entry (the ID includes the current time).
   *
   * @param findingId Finding the feedback refers to.
   * @param approved  Whether the finding was approved.
   * @param reason    Reviewer's explanation.
   * @param owner     Repository owner.
   * @param repo      Repository name.
   */
  async storeFeedback(
    findingId: string,
    approved: boolean,
    reason: string,
    owner: string,
    repo: string
  ): Promise<void> {
    const content = `Feedback: ${approved ? 'Approved' : 'Rejected'}\nReason: ${reason}\nFinding ID: ${findingId}`;

    await this.storeMemory({
      id: `feedback-${findingId}-${Date.now()}`,
      type: 'feedback',
      content,
      metadata: {
        approved,
        timestamp: new Date().toISOString(),
        owner,
        repo,
        project: `${owner}/${repo}`,
      },
    });
  }
}
|
||||
Reference in New Issue
Block a user