feat: 实现语音转文字并支持流式输出；

fix: 修复了语音解密失败的问题
2026-06-06 15:11:51 +00:00 · 2026-01-17 14:16:54 +08:00
parent 650de55202
commit e8babd48b6
33 changed files with 1713 additions and 570 deletions
--- a/electron/services/voiceTranscribeService.ts
+++ b/electron/services/voiceTranscribeService.ts
@@ -1,19 +1,23 @@
 import { app } from 'electron'
-import { createWriteStream, existsSync, mkdirSync, statSync, unlinkSync, writeFileSync } from 'fs'
-import { join, dirname } from 'path'
-import { promisify } from 'util'
-import { execFile, spawnSync } from 'child_process'
+import { existsSync, mkdirSync, statSync, unlinkSync, createWriteStream } from 'fs'
+import { join } from 'path'
 import * as https from 'https'
 import * as http from 'http'
 import { ConfigService } from './config'

-const execFileAsync = promisify(execFile)
+// Sherpa-onnx 类型定义
+type OfflineRecognizer = any
+type OfflineStream = any

-type WhisperModelInfo = {
+type ModelInfo = {
  name: string
-  fileName: string
+  files: {
+    model: string
+    tokens: string
+    vad: string
+  }
+  sizeBytes: number
  sizeLabel: string
-  sizeBytes?: number
 }

 type DownloadProgress = {
@@ -23,122 +27,169 @@ type DownloadProgress = {
  percent?: number
 }

-const WHISPER_MODELS: Record<string, WhisperModelInfo> = {
-  tiny: { name: 'tiny', fileName: 'ggml-tiny.bin', sizeLabel: '75 MB', sizeBytes: 75_000_000 },
-  base: { name: 'base', fileName: 'ggml-base.bin', sizeLabel: '142 MB', sizeBytes: 142_000_000 },
-  small: { name: 'small', fileName: 'ggml-small.bin', sizeLabel: '466 MB', sizeBytes: 466_000_000 },
-  medium: { name: 'medium', fileName: 'ggml-medium.bin', sizeLabel: '1.5 GB', sizeBytes: 1_500_000_000 },
-  'large-v3': { name: 'large-v3', fileName: 'ggml-large-v3.bin', sizeLabel: '2.9 GB', sizeBytes: 2_900_000_000 }
+const SENSEVOICE_MODEL: ModelInfo = {
+  name: 'SenseVoiceSmall',
+  files: {
+    model: 'model.int8.onnx',
+    tokens: 'tokens.txt',
+    vad: 'silero_vad.onnx'
+  },
+  sizeBytes: 245_000_000,
+  sizeLabel: '245 MB'
 }

-const WHISPER_SOURCES: Record<string, string> = {
-  official: 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main',
-  tsinghua: 'https://hf-mirror.com/ggerganov/whisper.cpp/resolve/main'
-}
-
-function getStaticFfmpegPath(): string | null {
-  try {
-    // eslint-disable-next-line @typescript-eslint/no-var-requires
-    const ffmpegStatic = require('ffmpeg-static')
-    if (typeof ffmpegStatic === 'string' && existsSync(ffmpegStatic)) {
-      return ffmpegStatic
-    }
-    const devPath = join(process.cwd(), 'node_modules', 'ffmpeg-static', 'ffmpeg.exe')
-    if (existsSync(devPath)) {
-      return devPath
-    }
-    if (app.isPackaged) {
-      const resourcesPath = process.resourcesPath
-      const packedPath = join(resourcesPath, 'app.asar.unpacked', 'node_modules', 'ffmpeg-static', 'ffmpeg.exe')
-      if (existsSync(packedPath)) {
-        return packedPath
-      }
-    }
-    return null
-  } catch {
-    return null
-  }
+const MODEL_DOWNLOAD_URLS = {
+  model: 'https://modelscope.cn/models/pengzhendong/sherpa-onnx-sense-voice-zh-en-ja-ko-yue/resolve/master/model.int8.onnx',
+  tokens: 'https://modelscope.cn/models/pengzhendong/sherpa-onnx-sense-voice-zh-en-ja-ko-yue/resolve/master/tokens.txt',
+  vad: 'https://www.modelscope.cn/models/manyeyes/silero-vad-onnx/resolve/master/silero_vad.onnx'
 }

 export class VoiceTranscribeService {
  private configService = new ConfigService()
  private downloadTasks = new Map<string, Promise<{ success: boolean; path?: string; error?: string }>>()
+  private recognizer: OfflineRecognizer | null = null
+  private isInitializing = false

-  private resolveModelInfo(modelName: string): WhisperModelInfo | null {
-    return WHISPER_MODELS[modelName] || null
-  }
-
-  private resolveModelDir(overrideDir?: string): string {
-    const configured = overrideDir || this.configService.get('whisperModelDir')
+  private resolveModelDir(): string {
+    const configured = this.configService.get('whisperModelDir') as string | undefined
    if (configured) return configured
-    return join(app.getPath('userData'), 'models', 'whisper')
+    return join(app.getPath('documents'), 'WeFlow', 'models', 'sensevoice')
  }

-  private resolveModelPath(modelName: string, overrideDir?: string): string | null {
-    const info = this.resolveModelInfo(modelName)
-    if (!info) return null
-    return join(this.resolveModelDir(overrideDir), info.fileName)
+  private resolveModelPath(fileName: string): string {
+    return join(this.resolveModelDir(), fileName)
  }

-  private resolveSourceUrl(overrideSource?: string): string {
-    const configured = overrideSource || this.configService.get('whisperDownloadSource')
-    if (configured && WHISPER_SOURCES[configured]) return WHISPER_SOURCES[configured]
-    return WHISPER_SOURCES.official
-  }
-
-  async getModelStatus(payload: { modelName: string; downloadDir?: string }): Promise<{
+  /**
+   * 检查模型状态
+   */
+  async getModelStatus(): Promise<{
    success: boolean
    exists?: boolean
-    path?: string
+    modelPath?: string
+    tokensPath?: string
    sizeBytes?: number
    error?: string
  }> {
-    const modelPath = this.resolveModelPath(payload.modelName, payload.downloadDir)
-    if (!modelPath) {
-      return { success: false, error: '未知模型名称' }
+    try {
+      const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
+      const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)
+      const vadPath = this.resolveModelPath((SENSEVOICE_MODEL.files as any).vad)
+
+      const modelExists = existsSync(modelPath)
+      const tokensExists = existsSync(tokensPath)
+      const vadExists = existsSync(vadPath)
+      const exists = modelExists && tokensExists && vadExists
+
+      if (!exists) {
+        return { success: true, exists: false, modelPath, tokensPath }
+      }
+
+      const modelSize = statSync(modelPath).size
+      const tokensSize = statSync(tokensPath).size
+      const vadSize = statSync(vadPath).size
+      const totalSize = modelSize + tokensSize + vadSize
+
+      return {
+        success: true,
+        exists: true,
+        modelPath,
+        tokensPath,
+        sizeBytes: totalSize
+      }
+    } catch (error) {
+      console.error('[VoiceTranscribe] getModelStatus error:', error)
+      return { success: false, error: String(error) }
    }
-    if (!existsSync(modelPath)) {
-      return { success: true, exists: false, path: modelPath }
-    }
-    const sizeBytes = statSync(modelPath).size
-    return { success: true, exists: true, path: modelPath, sizeBytes }
  }

+  /**
+   * 下载模型文件
+   */
  async downloadModel(
-    payload: { modelName: string; downloadDir?: string; source?: string },
    onProgress?: (progress: DownloadProgress) => void
-  ): Promise<{ success: boolean; path?: string; error?: string }> {
-    const info = this.resolveModelInfo(payload.modelName)
-    if (!info) {
-      return { success: false, error: '未知模型名称' }
-    }
-
-    const modelPath = this.resolveModelPath(payload.modelName, payload.downloadDir)
-    if (!modelPath) {
-      return { success: false, error: '模型路径生成失败' }
-    }
-
-    if (existsSync(modelPath)) {
-      return { success: true, path: modelPath }
-    }
-
-    const cacheKey = `${payload.modelName}:${modelPath}`
+  ): Promise<{ success: boolean; modelPath?: string; tokensPath?: string; error?: string }> {
+    const cacheKey = 'sensevoice'
    const pending = this.downloadTasks.get(cacheKey)
    if (pending) return pending

    const task = (async () => {
      try {
-        const targetDir = this.resolveModelDir(payload.downloadDir)
-        if (!existsSync(targetDir)) {
-          mkdirSync(targetDir, { recursive: true })
+        const modelDir = this.resolveModelDir()
+        if (!existsSync(modelDir)) {
+          mkdirSync(modelDir, { recursive: true })
        }

-        const baseUrl = this.resolveSourceUrl(payload.source)
-        const url = `${baseUrl}/${info.fileName}`
-        await this.downloadToFile(url, modelPath, payload.modelName, onProgress)
-        return { success: true, path: modelPath }
+        const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
+        const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)
+        const vadPath = this.resolveModelPath((SENSEVOICE_MODEL.files as any).vad)
+
+        // 下载模型文件 (40%)
+        console.info('[VoiceTranscribe] 开始下载模型文件...')
+        await this.downloadToFile(
+          MODEL_DOWNLOAD_URLS.model,
+          modelPath,
+          'model',
+          (downloaded, total) => {
+            const percent = total ? (downloaded / total) * 40 : undefined
+            onProgress?.({
+              modelName: SENSEVOICE_MODEL.name,
+              downloadedBytes: downloaded,
+              totalBytes: SENSEVOICE_MODEL.sizeBytes,
+              percent
+            })
+          }
+        )
+
+        // 下载 tokens 文件 (30%)
+        console.info('[VoiceTranscribe] 开始下载 tokens 文件...')
+        await this.downloadToFile(
+          MODEL_DOWNLOAD_URLS.tokens,
+          tokensPath,
+          'tokens',
+          (downloaded, total) => {
+            const modelSize = existsSync(modelPath) ? statSync(modelPath).size : 0
+            const percent = total ? 40 + (downloaded / total) * 30 : 40
+            onProgress?.({
+              modelName: SENSEVOICE_MODEL.name,
+              downloadedBytes: modelSize + downloaded,
+              totalBytes: SENSEVOICE_MODEL.sizeBytes,
+              percent
+            })
+          }
+        )
+
+        // 下载 vad 文件 (30%)
+        console.info('[VoiceTranscribe] 开始下载 VAD 文件...')
+        await this.downloadToFile(
+          (MODEL_DOWNLOAD_URLS as any).vad,
+          vadPath,
+          'vad',
+          (downloaded, total) => {
+            const modelSize = existsSync(modelPath) ? statSync(modelPath).size : 0
+            const tokensSize = existsSync(tokensPath) ? statSync(tokensPath).size : 0
+            const percent = total ? 70 + (downloaded / total) * 30 : 70
+            onProgress?.({
+              modelName: SENSEVOICE_MODEL.name,
+              downloadedBytes: modelSize + tokensSize + downloaded,
+              totalBytes: SENSEVOICE_MODEL.sizeBytes,
+              percent
+            })
+          }
+        )
+
+        console.info('[VoiceTranscribe] 模型下载完成')
+        return { success: true, modelPath, tokensPath }
      } catch (error) {
-        try { if (existsSync(modelPath)) unlinkSync(modelPath) } catch { }
+        console.error('[VoiceTranscribe] 下载失败:', error)
+        const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
+        const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)
+        const vadPath = this.resolveModelPath((SENSEVOICE_MODEL.files as any).vad)
+        try {
+          if (existsSync(modelPath)) unlinkSync(modelPath)
+          if (existsSync(tokensPath)) unlinkSync(tokensPath)
+          if (existsSync(vadPath)) unlinkSync(vadPath)
+        } catch { }
        return { success: false, error: String(error) }
      } finally {
        this.downloadTasks.delete(cacheKey)
@@ -149,102 +200,108 @@ export class VoiceTranscribeService {
    return task
  }

-  async transcribeWavBuffer(wavData: Buffer): Promise<{ success: boolean; transcript?: string; error?: string }> {
-    const modelName = this.configService.get('whisperModelName') || 'base'
-    const modelPath = this.resolveModelPath(modelName)
-    console.info('[VoiceTranscribe] check model', { modelName, modelPath, exists: modelPath ? existsSync(modelPath) : false })
-    if (!modelPath || !existsSync(modelPath)) {
-      return { success: false, error: '未下载语音模型，请在设置中下载' }
-    }
+  /**
+   * 转写 WAV 音频数据 (后台 Worker Threads 版本)
+   */
+  async transcribeWavBuffer(
+    wavData: Buffer,
+    onPartial?: (text: string) => void
+  ): Promise<{ success: boolean; transcript?: string; error?: string }> {
+    return new Promise((resolve) => {
+      try {
+        const modelPath = this.resolveModelPath(SENSEVOICE_MODEL.files.model)
+        const tokensPath = this.resolveModelPath(SENSEVOICE_MODEL.files.tokens)

-    // 使用内置的预编译 whisper-cli.exe
-    const resourcesPath = app.isPackaged
-      ? join(process.resourcesPath, 'resources')
-      : join(app.getAppPath(), 'resources')
-    const whisperExe = join(resourcesPath, 'whisper-cli.exe')
-    
-    if (!existsSync(whisperExe)) {
-      return { success: false, error: '找不到语音转写程序，请重新安装应用' }
-    }
+        if (!existsSync(modelPath) || !existsSync(tokensPath)) {
+          resolve({ success: false, error: '模型文件不存在，请先下载模型' })
+          return
+        }

-    const ffmpegPath = getStaticFfmpegPath() || 'ffmpeg'
-    console.info('[VoiceTranscribe] ffmpeg path', ffmpegPath)
+        const { Worker } = require('worker_threads')
+        // main.js 和 transcribeWorker.js 同在 dist-electron 目录下
+        const workerPath = join(__dirname, 'transcribeWorker.js')

-    const tempDir = app.getPath('temp')
-    const fileToken = `${Date.now()}_${Math.random().toString(16).slice(2)}`
-    const inputPath = join(tempDir, `weflow_voice_${fileToken}.wav`)
-    const outputPath = join(tempDir, `weflow_voice_${fileToken}_16k.wav`)
+        console.info('[VoiceTranscribe] 启动后台 Worker 转写...', { workerPath })

-    try {
-      writeFileSync(inputPath, wavData)
-      console.info('[VoiceTranscribe] converting to 16kHz', { inputPath, outputPath })
-      await execFileAsync(ffmpegPath, ['-y', '-i', inputPath, '-ar', '16000', '-ac', '1', outputPath])
-      
-      console.info('[VoiceTranscribe] transcribing with whisper', { whisperExe, modelPath })
-      const { stdout, stderr } = await execFileAsync(whisperExe, [
-        '-m', modelPath,
-        '-f', outputPath,
-        '-l', 'zh',
-        '-otxt',
-        '-np'  // no prints (只输出结果)
-      ], {
-        maxBuffer: 10 * 1024 * 1024,
-        cwd: dirname(whisperExe),  // 设置工作目录为 whisper-cli.exe 所在目录，确保能找到 DLL
-        env: { ...process.env, PATH: `${dirname(whisperExe)};${process.env.PATH}` }
-      })
+        const worker = new Worker(workerPath, {
+          workerData: {
+            modelPath,
+            tokensPath,
+            wavData,
+            sampleRate: 16000
+          }
+        })

-      console.info('[VoiceTranscribe] whisper stdout:', stdout)
-      if (stderr) console.warn('[VoiceTranscribe] whisper stderr:', stderr)
+        let finalTranscript = ''

-      // 解析输出文本
-      const outputBase = outputPath.replace(/\.[^.]+$/, '')
-      const txtFile = `${outputBase}.txt`
-      let transcript = ''
-      if (existsSync(txtFile)) {
-        const { readFileSync } = await import('fs')
-        transcript = readFileSync(txtFile, 'utf-8').trim()
-        unlinkSync(txtFile)
-      } else {
-        // 从 stdout 提取（使用 -np 参数后，stdout 只有转写结果）
-        transcript = stdout.trim()
+        worker.on('message', (msg: any) => {
+          if (msg.type === 'partial') {
+            onPartial?.(msg.text)
+          } else if (msg.type === 'final') {
+            finalTranscript = msg.text
+            resolve({ success: true, transcript: finalTranscript })
+            worker.terminate()
+          } else if (msg.type === 'error') {
+            resolve({ success: false, error: msg.error })
+            worker.terminate()
+          }
+        })
+
+        worker.on('error', (err: Error) => {
+          console.error('[VoiceTranscribe] Worker error:', err)
+          resolve({ success: false, error: String(err) })
+        })
+
+        worker.on('exit', (code: number) => {
+          if (code !== 0) {
+            console.error(`[VoiceTranscribe] Worker stopped with exit code ${code}`)
+            resolve({ success: false, error: `Worker exited with code ${code}` })
+          }
+        })
+
+      } catch (error) {
+        console.error('[VoiceTranscribe] 启动 Worker 失败:', error)
+        resolve({ success: false, error: String(error) })
      }
-
-      console.info('[VoiceTranscribe] success', { transcript })
-      return { success: true, transcript }
-    } catch (error: any) {
-      console.error('[VoiceTranscribe] failed', error)
-      console.error('[VoiceTranscribe] stderr:', error.stderr)
-      console.error('[VoiceTranscribe] stdout:', error.stdout)
-      return { success: false, error: String(error) }
-    } finally {
-      try { if (existsSync(inputPath)) unlinkSync(inputPath) } catch { }
-      try { if (existsSync(outputPath)) unlinkSync(outputPath) } catch { }
-    }
+    })
  }

+  /**
+   * 下载文件
+   */
  private downloadToFile(
    url: string,
    targetPath: string,
-    modelName: string,
-    onProgress?: (progress: DownloadProgress) => void,
-    remainingRedirects = 3
+    fileName: string,
+    onProgress?: (downloaded: number, total?: number) => void,
+    remainingRedirects = 5
  ): Promise<void> {
    return new Promise((resolve, reject) => {
      const protocol = url.startsWith('https') ? https : http
-      const request = protocol.get(url, (response) => {
+      console.info(`[VoiceTranscribe] 下载 ${fileName}:`, url)
+
+      const options = {
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        }
+      }
+
+      const request = protocol.get(url, options, (response) => {
+        // 处理重定向
        if ([301, 302, 303, 307, 308].includes(response.statusCode || 0) && response.headers.location) {
          if (remainingRedirects <= 0) {
-            reject(new Error('下载重定向次数过多'))
+            reject(new Error('重定向次数过多'))
            return
          }
-          this.downloadToFile(response.headers.location, targetPath, modelName, onProgress, remainingRedirects - 1)
+          console.info(`[VoiceTranscribe] 重定向到:`, response.headers.location)
+          this.downloadToFile(response.headers.location, targetPath, fileName, onProgress, remainingRedirects - 1)
            .then(resolve)
            .catch(reject)
          return
        }

        if (response.statusCode !== 200) {
-          reject(new Error(`下载失败: ${response.statusCode}`))
+          reject(new Error(`下载失败: HTTP ${response.statusCode}`))
          return
        }

@@ -255,8 +312,7 @@ export class VoiceTranscribeService {

        response.on('data', (chunk) => {
          downloadedBytes += chunk.length
-          const percent = totalBytes ? (downloadedBytes / totalBytes) * 100 : undefined
-          onProgress?.({ modelName, downloadedBytes, totalBytes, percent })
+          onProgress?.(downloadedBytes, totalBytes)
        })

        response.on('error', (error) => {
@@ -271,15 +327,33 @@ export class VoiceTranscribeService {

        writer.on('finish', () => {
          writer.close()
+          console.info(`[VoiceTranscribe] ${fileName} 下载完成:`, targetPath)
          resolve()
        })

        response.pipe(writer)
      })

-      request.on('error', reject)
+      request.on('error', (error) => {
+        console.error(`[VoiceTranscribe] ${fileName} 下载错误:`, error)
+        reject(error)
+      })
    })
  }
+
+  /**
+   * 清理资源
+   */
+  dispose() {
+    if (this.recognizer) {
+      try {
+        // sherpa-onnx 的 recognizer 可能需要手动释放
+        this.recognizer = null
+      } catch (error) {
+        console.error('[VoiceTranscribe] 释放识别器失败:', error)
+      }
+    }
+  }
 }

 export const voiceTranscribeService = new VoiceTranscribeService()