diff --git a/app/agent/llm/capability.py b/app/agent/llm/capability.py
index d4953942..a406df73 100644
--- a/app/agent/llm/capability.py
+++ b/app/agent/llm/capability.py
@@ -146,6 +146,12 @@ class OpenAIChatAudioProvider(AudioCapabilityProvider):
         ".opus": "audio/ogg",
         ".wav": "audio/wav",
     }
+    # Target container for ffmpeg-transcoded STT input.
+    TRANSCODED_STT_SUFFIX = ".wav"
+    # Output sample rate in Hz, kept as a string because it is passed as an ffmpeg argument.
+    TRANSCODED_STT_SAMPLE_RATE = "16000"
+    # Upper bound for a single ffmpeg run so a stalled transcode cannot block transcription.
+    TRANSCODED_STT_TIMEOUT_SECONDS = 60
 
     def _build_client(self, api_key: str, base_url: Optional[str]):
         from openai import OpenAI
@@ -229,6 +235,103 @@ class OpenAIChatAudioProvider(AudioCapabilityProvider):
             "format": self._guess_audio_format(filename),
         }
 
+    def _normalize_audio_for_transcription(
+        self, content: bytes, filename: str
+    ) -> Optional[tuple[bytes, str]]:
+        """
+        Normalize transcription input to a format the Chat Audio provider accepts.
+
+        :param content: raw audio bytes
+        :param filename: original audio file name; its suffix selects the path taken
+        :return: ``(audio bytes, file name)`` ready for submission, or None on failure
+        """
+        suffix = Path(filename or "").suffix.lower()
+        # Suffixes listed in SUPPORTED_AUDIO_MIME_TYPES are submitted untouched.
+        if suffix in self.SUPPORTED_AUDIO_MIME_TYPES:
+            return content, filename
+        return self._convert_audio_for_transcription(content=content, filename=filename)
+
+    def _convert_audio_for_transcription(
+        self, content: bytes, filename: str
+    ) -> Optional[tuple[bytes, str]]:
+        """
+        Transcode input the STT provider does not accept (e.g. AMR) to mono WAV via ffmpeg.
+
+        :param content: raw audio bytes
+        :param filename: original audio file name; only its suffix is reused
+        :return: ``(WAV bytes, WAV file name)`` on success, None when ffmpeg is missing,
+            fails, times out, or the temporary files cannot be read or written
+        """
+        if not shutil.which("ffmpeg"):
+            logger.warning(
+                "%s STT 不支持当前音频格式且 ffmpeg 不可用,无法转码: filename=%s",
+                self.DISPLAY_NAME,
+                filename,
+            )
+            return None
+
+        suffix = Path(filename or "").suffix.lower() or ".audio"
+        voice_dir = settings.TEMP_PATH / "voice"
+        token = uuid4().hex
+        # Keep input and output names distinct even if the input suffix is already
+        # ".wav"; ffmpeg cannot transcode a file onto itself.
+        input_path = voice_dir / f"{token}_src{suffix}"
+        output_path = voice_dir / f"{token}{self.TRANSCODED_STT_SUFFIX}"
+        try:
+            voice_dir.mkdir(parents=True, exist_ok=True)
+            input_path.write_bytes(content)
+            cmd = [
+                "ffmpeg",
+                "-y",
+                "-i",
+                str(input_path),
+                "-ar",
+                self.TRANSCODED_STT_SAMPLE_RATE,
+                "-ac",
+                "1",
+                "-f",
+                "wav",
+                str(output_path),
+            ]
+            result = subprocess.run(
+                cmd,
+                stdin=subprocess.DEVNULL,
+                capture_output=True,
+                text=True,
+                # ffmpeg may echo undecodable metadata on stderr; never fail on that.
+                errors="replace",
+                timeout=self.TRANSCODED_STT_TIMEOUT_SECONDS,
+                check=False,
+            )
+            if result.returncode != 0 or not output_path.exists():
+                logger.warning(
+                    "%s STT 音频转 WAV 失败: returncode=%s, stderr=%s",
+                    self.DISPLAY_NAME,
+                    result.returncode,
+                    (result.stderr or "").strip()[:500],
+                )
+                return None
+            return output_path.read_bytes(), output_path.name
+        except (OSError, subprocess.TimeoutExpired) as err:
+            # Honor the "None on failure" contract instead of leaking I/O or timeout errors.
+            logger.warning(
+                "%s STT 音频转 WAV 异常: filename=%s, error=%s",
+                self.DISPLAY_NAME,
+                filename,
+                err,
+            )
+            return None
+        finally:
+            for temp_path in (input_path, output_path):
+                try:
+                    temp_path.unlink(missing_ok=True)
+                except OSError as cleanup_err:
+                    logger.debug(
+                        "清理 STT 临时音频失败: path=%s, error=%s",
+                        temp_path,
+                        cleanup_err,
+                    )
+
     @staticmethod
     def _extract_message_text(message) -> Optional[str]:
         """兼容音频理解响应可能放在 content 或 reasoning_content 的情况。"""
@@ -310,6 +413,13 @@ class OpenAIChatAudioProvider(AudioCapabilityProvider):
         if not api_key:
             raise ValueError("音频输入 provider 未配置 API Key")
         client = self._build_client(api_key=api_key, base_url=base_url)
+        # Transcode provider-unsupported formats first; give up when that fails.
+        normalized_audio = self._normalize_audio_for_transcription(
+            content=content, filename=filename
+        )
+        if not normalized_audio:
+            return None
+        content, filename = normalized_audio
         language = (settings.AUDIO_INPUT_LANGUAGE or "").strip()
         prompt = "请将这段音频完整转写为文字,只输出转写结果,不要添加解释。"
         if language:
diff --git a/tests/test_agent_llm_capability.py b/tests/test_agent_llm_capability.py
index 0fcf9ee0..6b58e4a1 100644
--- a/tests/test_agent_llm_capability.py
+++ b/tests/test_agent_llm_capability.py
@@ -232,6 +232,38 @@ class AgentCapabilityManagerTest(unittest.TestCase):
         )
         self.assertIn("只输出转写结果", content[1]["text"])
 
+    def test_mimo_stt_transcodes_amr_before_payload(self):
+        """Verify MiMo audio input transcodes WeCom AMR to a supported WAV before the payload."""
+        provider = MiMoAudioProvider()
+        fake_client = Mock()
+        fake_client.chat.completions.create.return_value = SimpleNamespace(
+            choices=[SimpleNamespace(message=SimpleNamespace(content="你好"))]
+        )
+
+        with patch.object(provider, "_build_client", return_value=fake_client), patch.object(
+            provider,
+            "_convert_audio_for_transcription",
+            return_value=(b"wav-bytes", "input.wav"),
+        ) as convert_audio, patch.object(settings, "AUDIO_INPUT_MODEL", "mimo-v2.5"), patch.object(
+            settings, "AUDIO_INPUT_LANGUAGE", "zh"
+        ), patch.object(
+            settings, "AUDIO_INPUT_API_KEY", "sk-test"
+        ), patch.object(
+            settings, "AUDIO_INPUT_BASE_URL", "https://api.xiaomimimo.com/v1"
+        ):
+            result = provider.transcribe_audio(b"amr-bytes", filename="input.amr")
+
+        self.assertEqual(result, "你好")
+        convert_audio.assert_called_once_with(
+            content=b"amr-bytes", filename="input.amr"
+        )
+        request = fake_client.chat.completions.create.call_args.kwargs
+        content = request["messages"][0]["content"]
+        self.assertEqual(
+            content[0]["input_audio"]["data"],
+            f"data:audio/wav;base64,{b64encode(b'wav-bytes').decode('utf-8')}",
+        )
+
     def test_minimax_stt_normalizes_openai_default_model(self):
         """校验 MiniMax 音频输入会把 OpenAI 默认模型兜底为 MiniMax 模型。"""
         provider = MiniMaxAudioProvider()