diff --git a/electron/main.ts b/electron/main.ts
index 15ae142..8cade14 100644
--- a/electron/main.ts
+++ b/electron/main.ts
@@ -442,6 +442,9 @@ function registerIpcHandlers() {
   ipcMain.handle('chat:getVoiceData', async (_, sessionId: string, msgId: string, createTime?: number, serverId?: string | number) => {
     return chatService.getVoiceData(sessionId, msgId, createTime, serverId)
   })
+  ipcMain.handle('chat:resolveVoiceCache', async (_, sessionId: string, msgId: string) => {
+    return chatService.resolveVoiceCache(sessionId, msgId)
+  })

   ipcMain.handle('chat:getVoiceTranscript', async (event, sessionId: string, msgId: string) => {
     return chatService.getVoiceTranscript(sessionId, msgId, (text) => {
diff --git a/electron/preload.ts b/electron/preload.ts
index 3c50915..775e19a 100644
--- a/electron/preload.ts
+++ b/electron/preload.ts
@@ -108,6 +108,7 @@ contextBridge.exposeInMainWorld('electronAPI', {
   getImageData: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:getImageData', sessionId, msgId),
   getVoiceData: (sessionId: string, msgId: string, createTime?: number, serverId?: string | number) => ipcRenderer.invoke('chat:getVoiceData', sessionId, msgId, createTime, serverId),
+  resolveVoiceCache: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:resolveVoiceCache', sessionId, msgId),
  getVoiceTranscript: (sessionId: string, msgId: string) => ipcRenderer.invoke('chat:getVoiceTranscript', sessionId, msgId),
  onVoiceTranscriptPartial: (callback: (payload: { msgId: string; text: string }) => void) => {
    const listener = (_: any, payload: { msgId: string; text: string }) => callback(payload)
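Note: the renderer-side consumer of the new bridge is not part of this diff. A minimal cache-first playback sketch, assuming a hypothetical `playVoiceMessage` helper and assuming `getVoiceData` resolves to a `{ success, data? }` shape like the legacy method further down, could look like this:

```ts
// Sketch only: probe the cache via the new IPC first, then fall back to the full lookup.
async function playVoiceMessage(sessionId: string, msgId: string, createTime?: number, serverId?: string | number) {
  const cached = await window.electronAPI.resolveVoiceCache(sessionId, msgId)
  let base64 = cached.success && cached.hasCache ? cached.data : undefined

  if (!base64) {
    // Full lookup; in the main process this may go through wcdb_api.dll (see chatService below).
    const res = await window.electronAPI.getVoiceData(sessionId, msgId, createTime, serverId)
    base64 = res?.data
  }
  if (!base64) return

  // base64 WAV -> Blob -> <audio>
  const bytes = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0))
  const url = URL.createObjectURL(new Blob([bytes], { type: 'audio/wav' }))
  await new Audio(url).play()
}
```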
diff --git a/electron/services/chatService.ts b/electron/services/chatService.ts
index cf8679d..cfe251b 100644
--- a/electron/services/chatService.ts
+++ b/electron/services/chatService.ts
@@ -2202,7 +2202,7 @@ class ChatService {
       // 3. 调用 C++ 接口获取语音 (Hex)
-      const voiceRes = await wcdbService.getVoiceData(sessionId, msgCreateTime, candidates, msgSvrId)
+      const voiceRes = await wcdbService.getVoiceData(sessionId, msgCreateTime, candidates, localId, msgSvrId)
       if (!voiceRes.success || !voiceRes.hex) {
         return { success: false, error: voiceRes.error || '未找到语音数据' }
       }
@@ -2245,6 +2245,33 @@ class ChatService {
     }
   }

+  /**
+   * 检查语音是否已有缓存
+   */
+  async resolveVoiceCache(sessionId: string, msgId: string): Promise<{ success: boolean; hasCache: boolean; data?: string }> {
+    try {
+      const cacheKey = this.getVoiceCacheKey(sessionId, msgId)
+
+      // 1. 检查内存缓存
+      const inMemory = this.voiceWavCache.get(cacheKey)
+      if (inMemory) {
+        return { success: true, hasCache: true, data: inMemory.toString('base64') }
+      }
+
+      // 2. 检查文件缓存
+      const cachedFile = this.getVoiceCacheFilePath(cacheKey)
+      if (existsSync(cachedFile)) {
+        const wavData = readFileSync(cachedFile)
+        this.cacheVoiceWav(cacheKey, wavData) // 回填内存
+        return { success: true, hasCache: true, data: wavData.toString('base64') }
+      }
+
+      return { success: true, hasCache: false }
+    } catch (e) {
+      return { success: false, hasCache: false }
+    }
+  }
+
   async getVoiceData_Legacy(sessionId: string, msgId: string): Promise<{ success: boolean; data?: string; error?: string }> {
     try {
       const localId = parseInt(msgId, 10)
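Note: the diff does not update a renderer type declaration for the new bridge method. If the project keeps a typed `window.electronAPI` (declaration file name assumed here), the matching entry would look roughly like this:

```ts
// Assumed declaration file, e.g. src/types/electron-api.d.ts (not shown in this diff)
export interface VoiceCacheResult {
  success: boolean
  hasCache: boolean
  /** base64-encoded WAV taken from the in-memory or on-disk voice cache */
  data?: string
}

declare global {
  interface Window {
    electronAPI: {
      resolveVoiceCache(sessionId: string, msgId: string): Promise<VoiceCacheResult>
      // ...existing members (getVoiceData, getVoiceTranscript, ...) unchanged
    }
  }
}
```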
diff --git a/electron/services/voiceTranscribeService.ts b/electron/services/voiceTranscribeService.ts
index 4af6561..a23d0d2 100644
--- a/electron/services/voiceTranscribeService.ts
+++ b/electron/services/voiceTranscribeService.ts
@@ -14,7 +14,6 @@ type ModelInfo = {
   files: {
     model: string
     tokens: string
-    vad: string
   }
   sizeBytes: number
   sizeLabel: string
@@ -31,8 +30,7 @@ const SENSEVOICE_MODEL: ModelInfo = {
   name: 'SenseVoiceSmall',
   files: {
     model: 'model.int8.onnx',
-    tokens: 'tokens.txt',
-    vad: 'silero_vad.onnx'
+    tokens: 'tokens.txt'
   },
   sizeBytes: 245_000_000,
   sizeLabel: '245 MB'
@@ -40,8 +38,7 @@ const SENSEVOICE_MODEL: ModelInfo = {
 }

 const MODEL_DOWNLOAD_URLS = {
   model: 'https://modelscope.cn/models/pengzhendong/sherpa-onnx-sense-voice-zh-en-ja-ko-yue/resolve/master/model.int8.onnx',
-  tokens: 'https://modelscope.cn/models/pengzhendong/sherpa-onnx-sense-voice-zh-en-ja-ko-yue/resolve/master/tokens.txt',
-  vad: 'https://www.modelscope.cn/models/manyeyes/silero-vad-onnx/resolve/master/silero_vad.onnx'
+  tokens: 'https://modelscope.cn/models/pengzhendong/sherpa-onnx-sense-voice-zh-en-ja-ko-yue/resolve/master/tokens.txt'
 }
@@ -74,12 +71,9 @@ export class VoiceTranscribeService {
     try {
       const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
       const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)
-      const vadPath = this.resolveModelPath((SENSEVOICE_MODEL.files as any).vad)
-
       const modelExists = existsSync(modelPath)
       const tokensExists = existsSync(tokensPath)
-      const vadExists = existsSync(vadPath)
-      const exists = modelExists && tokensExists && vadExists
+      const exists = modelExists && tokensExists

       if (!exists) {
         return { success: true, exists: false, modelPath, tokensPath }
@@ -87,8 +81,7 @@ export class VoiceTranscribeService {
       const modelSize = statSync(modelPath).size
       const tokensSize = statSync(tokensPath).size
-      const vadSize = statSync(vadPath).size
-      const totalSize = modelSize + tokensSize + vadSize
+      const totalSize = modelSize + tokensSize

       return {
         success: true,
@@ -121,7 +114,6 @@ export class VoiceTranscribeService {
     const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
     const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)
-    const vadPath = this.resolveModelPath((SENSEVOICE_MODEL.files as any).vad)

     // 初始进度
     onProgress?.({
@@ -166,35 +158,16 @@ export class VoiceTranscribeService {
         }
       )

-      // 下载 vad 文件 (30%)
-      console.info('[VoiceTranscribe] 开始下载 VAD 文件...')
-      await this.downloadToFile(
-        (MODEL_DOWNLOAD_URLS as any).vad,
-        vadPath,
-        'vad',
-        (downloaded, total) => {
-          const modelSize = existsSync(modelPath) ? statSync(modelPath).size : 0
-          const tokensSize = existsSync(tokensPath) ? statSync(tokensPath).size : 0
-          const percent = total ? 70 + (downloaded / total) * 30 : 70
-          onProgress?.({
-            modelName: SENSEVOICE_MODEL.name,
-            downloadedBytes: modelSize + tokensSize + downloaded,
-            totalBytes: SENSEVOICE_MODEL.sizeBytes,
-            percent
-          })
-        }
-      )
+      console.info('[VoiceTranscribe] 模型下载完成')

       console.info('[VoiceTranscribe] 所有文件下载完成')
       return { success: true, modelPath, tokensPath }
     } catch (error) {
       const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
       const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)
-      const vadPath = this.resolveModelPath((SENSEVOICE_MODEL.files as any).vad)
       try {
         if (existsSync(modelPath)) unlinkSync(modelPath)
         if (existsSync(tokensPath)) unlinkSync(tokensPath)
-        if (existsSync(vadPath)) unlinkSync(vadPath)
       } catch { }
       return { success: false, error: String(error) }
     } finally {
@@ -230,7 +203,7 @@ export class VoiceTranscribeService {
       supportedLanguages = this.configService.get('transcribeLanguages')
       // 如果配置中也没有或为空,使用默认值
       if (!supportedLanguages || supportedLanguages.length === 0) {
-        supportedLanguages = ['zh']
+        supportedLanguages = ['zh', 'yue']
       }
     }
@@ -303,7 +276,7 @@ export class VoiceTranscribeService {
       const request = protocol.get(url, options, (response) => {
         console.info(`[VoiceTranscribe] ${fileName} 响应状态:`, response.statusCode)
-
+
         // 处理重定向
         if ([301, 302, 303, 307, 308].includes(response.statusCode || 0) && response.headers.location) {
           if (remainingRedirects <= 0) {
@@ -324,11 +297,11 @@ export class VoiceTranscribeService {
         const totalBytes = Number(response.headers['content-length'] || 0) || undefined
         let downloadedBytes = 0
-
+
         console.info(`[VoiceTranscribe] ${fileName} 文件大小:`, totalBytes ? `${(totalBytes / 1024 / 1024).toFixed(2)} MB` : '未知')

         const writer = createWriteStream(targetPath)
-
+
         // 设置数据接收超时(60秒没有数据则超时)
         let lastDataTime = Date.now()
         const dataTimeout = setInterval(() => {
@@ -392,7 +365,7 @@ export class VoiceTranscribeService {
         // sherpa-onnx 的 recognizer 可能需要手动释放
         this.recognizer = null
       } catch (error) {
-      }
+      }
     }
   }
 }
diff --git a/electron/services/wcdbCore.ts b/electron/services/wcdbCore.ts
index 0f19831..1949e74 100644
--- a/electron/services/wcdbCore.ts
+++ b/electron/services/wcdbCore.ts
@@ -347,9 +347,9 @@ export class WcdbCore {
       this.wcdbGetDbStatus = null
     }

-    // wcdb_status wcdb_get_voice_data(wcdb_handle handle, const char* session_id, int32_t create_time, const char* candidates_json, char** out_hex)
+    // wcdb_status wcdb_get_voice_data(wcdb_handle handle, const char* session_id, int32_t create_time, int32_t local_id, int64_t svr_id, const char* candidates_json, char** out_hex)
     try {
-      this.wcdbGetVoiceData = this.lib.func('int32 wcdb_get_voice_data(int64 handle, const char* sessionId, int32 createTime, int64 svrId, const char* candidatesJson, _Out_ void** outHex)')
+      this.wcdbGetVoiceData = this.lib.func('int32 wcdb_get_voice_data(int64 handle, const char* sessionId, int32 createTime, int32 localId, int64 svrId, const char* candidatesJson, _Out_ void** outHex)')
     } catch {
       this.wcdbGetVoiceData = null
     }
@@ -1321,12 +1321,12 @@ export class WcdbCore {
     }
   }

-  async getVoiceData(sessionId: string, createTime: number, candidates: string[], svrId: string | number = 0): Promise<{ success: boolean; hex?: string; error?: string }> {
+  async getVoiceData(sessionId: string, createTime: number, candidates: string[], localId: number = 0, svrId: string | number = 0): Promise<{ success: boolean; hex?: string; error?: string }> {
     if (!this.ensureReady()) return { success: false, error: 'WCDB 未连接' }
     if (!this.wcdbGetVoiceData) return { success: false, error: '当前 DLL 版本不支持获取语音数据' }
     try {
       const outPtr = [null as any]
-      const result = this.wcdbGetVoiceData(this.handle, sessionId, createTime, BigInt(svrId || 0), JSON.stringify(candidates), outPtr)
+      const result = this.wcdbGetVoiceData(this.handle, sessionId, createTime, localId, BigInt(svrId || 0), JSON.stringify(candidates), outPtr)
       if (result !== 0 || !outPtr[0]) {
         return { success: false, error: `获取语音数据失败: ${result}` }
       }
diff --git a/electron/services/wcdbService.ts b/electron/services/wcdbService.ts
index 7f750c3..7628c67 100644
--- a/electron/services/wcdbService.ts
+++ b/electron/services/wcdbService.ts
@@ -99,7 +99,7 @@ export class WcdbService {
   setPaths(resourcesPath: string, userDataPath: string): void {
     this.resourcesPath = resourcesPath
     this.userDataPath = userDataPath
-    this.callWorker('setPaths', { resourcesPath, userDataPath }).catch(() => {})
+    this.callWorker('setPaths', { resourcesPath, userDataPath }).catch(() => { })
   }

   /**
@@ -107,7 +107,7 @@ export class WcdbService {
    */
   setLogEnabled(enabled: boolean): void {
     this.logEnabled = enabled
-    this.callWorker('setLogEnabled', { enabled }).catch(() => {})
+    this.callWorker('setLogEnabled', { enabled }).catch(() => { })
   }

   /**
@@ -346,8 +346,8 @@ export class WcdbService {
   /**
    * 获取语音数据
    */
-  async getVoiceData(sessionId: string, createTime: number, candidates: string[], svrId: string | number = 0): Promise<{ success: boolean; hex?: string; error?: string }> {
-    return this.callWorker('getVoiceData', { sessionId, createTime, candidates, svrId })
+  async getVoiceData(sessionId: string, createTime: number, candidates: string[], localId: number = 0, svrId: string | number = 0): Promise<{ success: boolean; hex?: string; error?: string }> {
+    return this.callWorker('getVoiceData', { sessionId, createTime, candidates, localId, svrId })
   }
 }
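Note: because `localId` was inserted before `svrId`, every positional caller of `getVoiceData` has to be touched (chatService and wcdbWorker are both updated in this diff). For reference, a call against the new signature, with made-up argument values:

```ts
// Illustrative values only; the signature is the one introduced above.
const res = await wcdbService.getVoiceData(
  'wxid_example_session',  // sessionId (hypothetical)
  1716000000,              // createTime, unix seconds
  ['candidate_key_1'],     // candidates, serialized to JSON for the DLL
  4321,                    // localId, the new int32 argument
  '7890123456789'          // svrId, still last; wcdbCore wraps it with BigInt()
)
if (res.success && res.hex) {
  // res.hex carries the voice payload as a hex string
}
```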
diff --git a/electron/transcribeWorker.ts b/electron/transcribeWorker.ts
index f078d04..6e353a0 100644
--- a/electron/transcribeWorker.ts
+++ b/electron/transcribeWorker.ts
@@ -1,5 +1,4 @@
 import { parentPort, workerData } from 'worker_threads'
-import * as fs from 'fs'

 interface WorkerParams {
   modelPath: string
@@ -18,16 +17,66 @@ const LANGUAGE_TAGS: Record<string, string> = {
   'yue': '<|yue|>' // 粤语
 }

+// 技术标签(识别语言、语速、ITN等),需要从最终文本中移除
+const TECH_TAGS = [
+  '<|zh|>', '<|en|>', '<|ja|>', '<|ko|>', '<|yue|>',
+  '<|nospeech|>', '<|speech|>',
+  '<|itn|>', '<|wo_itn|>',
+  '<|NORMAL|>'
+]
+
+// 情感与事件标签映射,转换为直观的 Emoji
+const RICH_TAG_MAP: Record<string, string> = {
+  '<|HAPPY|>': '😊',
+  '<|SAD|>': '😔',
+  '<|ANGRY|>': '😠',
+  '<|NEUTRAL|>': '', // 中性情感不特别标记
+  '<|FEARFUL|>': '😨',
+  '<|DISGUSTED|>': '🤢',
+  '<|SURPRISED|>': '😮',
+  '<|BGM|>': '🎵',
+  '<|Applause|>': '👏',
+  '<|Laughter|>': '😂',
+  '<|Cry|>': '😭',
+  '<|Cough|>': ' (咳嗽) ',
+  '<|Sneeze|>': ' (喷嚏) ',
+}
+
+/**
+ * 富文本后处理:移除技术标签,转换识别出的情感和声音事件
+ */
+function richTranscribePostProcess(text: string): string {
+  if (!text) return ''
+
+  let processed = text
+
+  // 1. 转换情感和事件标签
+  for (const [tag, replacement] of Object.entries(RICH_TAG_MAP)) {
+    // 使用正则全局替换,不区分大小写以防不同版本差异
+    const escapedTag = tag.replace(/[|<>]/g, '\\$&')
+    processed = processed.replace(new RegExp(escapedTag, 'gi'), replacement)
+  }
+
+  // 2. 移除所有剩余的技术标签
+  for (const tag of TECH_TAGS) {
+    const escapedTag = tag.replace(/[|<>]/g, '\\$&')
+    processed = processed.replace(new RegExp(escapedTag, 'gi'), '')
+  }
+
+  // 3. 清理多余空格并返回
+  return processed.replace(/\s+/g, ' ').trim()
+}
+
 // 检查识别结果是否在允许的语言列表中
 function isLanguageAllowed(result: any, allowedLanguages: string[]): boolean {
   if (!result || !result.lang) {
-    // 如果没有语言信息,默认允许
+    // 如果没有语言信息,默认允许(或从文本开头尝试提取)
     return true
   }

-  // 如果没有指定语言或语言列表为空,默认只允许中文
+  // 如果没有指定语言或语言列表为空,默认允许中文和粤语
   if (!allowedLanguages || allowedLanguages.length === 0) {
-    allowedLanguages = ['zh']
+    allowedLanguages = ['zh', 'yue']
   }

   const langTag = result.lang
@@ -55,7 +104,7 @@ async function run() {
   let sherpa: any;
   try {
     sherpa = require('sherpa-onnx-node');
-  } catch (requireError) {
+  } catch (requireError) {
     parentPort.postMessage({ type: 'error', error: 'Failed to load speech engine: ' + String(requireError) });
     return;
   }
@@ -65,11 +114,11 @@ async function run() {
     // 确保有有效的语言列表,默认只允许中文
     let allowedLanguages = languages || ['zh']
     if (allowedLanguages.length === 0) {
-      allowedLanguages = ['zh']
+      allowedLanguages = ['zh']
     }
-
+
     console.log('[TranscribeWorker] 使用的语言白名单:', allowedLanguages)
-
+
     // 1. 初始化识别器 (SenseVoiceSmall)
     const recognizerConfig = {
       modelConfig: {
@@ -83,122 +132,31 @@ async function run() {
       }
     }
     const recognizer = new sherpa.OfflineRecognizer(recognizerConfig)

-    // 2. 初始化 VAD (用于流式输出效果)
-    const vadPath = modelPath.replace('model.int8.onnx', 'silero_vad.onnx');
-    const vadConfig = {
-      sileroVad: {
-        model: vadPath,
-        threshold: 0.5,
-        minSilenceDuration: 0.5,
-        minSpeechDuration: 0.25,
-        windowSize: 512
-      },
-      sampleRate: sampleRate,
-      debug: 0,
-      numThreads: 1
-    }
-    // 检查 VAD 模型是否存在,如果不存在则退回到全量识别
-    if (!fs.existsSync(vadPath)) {
-      const pcmData = wavData.slice(44)
-      const samples = new Float32Array(pcmData.length / 2)
-      for (let i = 0; i < samples.length; i++) {
-        samples[i] = pcmData.readInt16LE(i * 2) / 32768.0
-      }
-
-      const stream = recognizer.createStream()
-      stream.acceptWaveform({ sampleRate, samples })
-      recognizer.decode(stream)
-      const result = recognizer.getResult(stream)
-
-      console.log('[TranscribeWorker] 非VAD模式 - 识别结果对象:', JSON.stringify(result, null, 2))
-
-      // 检查语言是否在白名单中
-      if (isLanguageAllowed(result, allowedLanguages)) {
-        console.log('[TranscribeWorker] 非VAD模式 - 保留文本:', result.text)
-        parentPort.postMessage({ type: 'final', text: result.text })
-      } else {
-        console.log('[TranscribeWorker] 非VAD模式 - 语言不匹配,返回空文本')
-        parentPort.postMessage({ type: 'final', text: '' })
-      }
-      return
-    }
-
-    const vad = new sherpa.Vad(vadConfig, 60) // 60s max
-    // 3. 处理音频数据
+    // 2. 处理音频数据 (全量识别)
     const pcmData = wavData.slice(44)
     const samples = new Float32Array(pcmData.length / 2)
     for (let i = 0; i < samples.length; i++) {
       samples[i] = pcmData.readInt16LE(i * 2) / 32768.0
     }

-    // 模拟流式输入:按小块喂给 VAD
-    const chunkSize = 1600 // 100ms for 16kHz
-    let offset = 0
-    let accumulatedText = ''
+    const stream = recognizer.createStream()
+    stream.acceptWaveform({ sampleRate, samples })
+    recognizer.decode(stream)
+    const result = recognizer.getResult(stream)

-    let segmentCount = 0;
+    console.log('[TranscribeWorker] 识别完成 - 结果对象:', JSON.stringify(result, null, 2))

-    while (offset < samples.length) {
-      const end = Math.min(offset + chunkSize, samples.length)
-      const chunk = samples.subarray(offset, end)
-
-      vad.acceptWaveform(chunk)
-
-      // 检查 ASR 结果
-      while (!vad.isEmpty()) {
-        const segment = vad.front(false)
-
-        const stream = recognizer.createStream()
-        stream.acceptWaveform({ sampleRate, samples: segment.samples })
-        recognizer.decode(stream)
-        const result = recognizer.getResult(stream)
-
-        console.log('[TranscribeWorker] 识别结果 - lang:', result.lang, 'text:', result.text)
-
-        // 检查语言是否在白名单中
-        if (result.text && isLanguageAllowed(result, allowedLanguages)) {
-          const text = result.text.trim()
-          if (text.length > 0) {
-            accumulatedText += (accumulatedText ? ' ' : '') + text
-            segmentCount++;
-            parentPort.postMessage({ type: 'partial', text: accumulatedText })
-          }
-        } else if (result.text) {
-          console.log('[TranscribeWorker] 跳过不匹配的语言段落')
-        }
-        vad.pop()
-      }
-
-      offset = end
-      // 让出主循环,保持响应
-      await new Promise(resolve => setImmediate(resolve))
+    // 3. 检查语言是否在白名单中
+    if (isLanguageAllowed(result, allowedLanguages)) {
+      const processedText = richTranscribePostProcess(result.text)
+      console.log('[TranscribeWorker] 语言匹配,返回文本:', processedText)
+      parentPort.postMessage({ type: 'final', text: processedText })
+    } else {
+      console.log('[TranscribeWorker] 语言不匹配,返回空文本')
+      parentPort.postMessage({ type: 'final', text: '' })
     }
-    // Ensure any remaining buffer is processed
-    vad.flush();
-    while (!vad.isEmpty()) {
-      const segment = vad.front(false);
-      const stream = recognizer.createStream()
-      stream.acceptWaveform({ sampleRate, samples: segment.samples })
-      recognizer.decode(stream)
-      const result = recognizer.getResult(stream)
-
-      console.log('[TranscribeWorker] flush阶段 - lang:', result.lang, 'text:', result.text)
-
-      // 检查语言是否在白名单中
-      if (result.text && isLanguageAllowed(result, allowedLanguages)) {
-        const text = result.text.trim()
-        if (text) {
-          accumulatedText += (accumulatedText ? ' ' : '') + text
-          parentPort.postMessage({ type: 'partial', text: accumulatedText })
-        }
-      }
-      vad.pop();
-    }
-
-    parentPort.postMessage({ type: 'final', text: accumulatedText })
-
   } catch (error) {
     parentPort.postMessage({ type: 'error', error: String(error) })
   }
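Note: with the tag tables above, the post-processing step turns a tagged recognizer result into display text like this (the input string is a fabricated example, not captured model output):

```ts
const raw = '<|yue|><|HAPPY|><|Speech|><|wo_itn|> 今日 好開心 '
richTranscribePostProcess(raw)
// => '😊 今日 好開心'
// <|yue|>, <|Speech|> and <|wo_itn|> are stripped case-insensitively,
// <|HAPPY|> maps to 😊, and runs of whitespace collapse to single spaces.
```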
diff --git a/electron/wcdbWorker.ts b/electron/wcdbWorker.ts
index 8195be8..d836a79 100644
--- a/electron/wcdbWorker.ts
+++ b/electron/wcdbWorker.ts
@@ -111,7 +111,7 @@ if (parentPort) {
         result = await core.getMessageById(payload.sessionId, payload.localId)
         break
       case 'getVoiceData':
-        result = await core.getVoiceData(payload.sessionId, payload.createTime, payload.candidates, payload.svrId)
+        result = await core.getVoiceData(payload.sessionId, payload.createTime, payload.candidates, payload.localId, payload.svrId)
         if (!result.success) {
           console.error('[wcdbWorker] getVoiceData failed:', result.error)
         }
diff --git a/resources/wcdb_api.dll b/resources/wcdb_api.dll
index 745f9f0..de0b445 100644
Binary files a/resources/wcdb_api.dll and b/resources/wcdb_api.dll differ
diff --git a/src/components/AnimatedStreamingText.tsx b/src/components/AnimatedStreamingText.tsx
index 329f7e9..2b20d9a 100644
--- a/src/components/AnimatedStreamingText.tsx
+++ b/src/components/AnimatedStreamingText.tsx
@@ -44,12 +44,22 @@ export const AnimatedStreamingText = memo(({ text, className, loading }: Animate
         ))}