From 766d2699ea437cfdf72fd49b1dd1cc9c049e94ba Mon Sep 17 00:00:00 2001
From: jxxghp
Date: Mon, 25 May 2026 11:42:57 +0800
Subject: [PATCH] feat: support MiniMax audio provider

---
 app/agent/llm/capability.py        | 158 ++++++++++++++++++++++++++++-
 app/core/config.py                 |   4 +-
 tests/test_agent_llm_capability.py |  62 +++++++++++
 3 files changed, 221 insertions(+), 3 deletions(-)

diff --git a/app/agent/llm/capability.py b/app/agent/llm/capability.py
index 9374e873..d4953942 100644
--- a/app/agent/llm/capability.py
+++ b/app/agent/llm/capability.py
@@ -9,11 +9,12 @@ import subprocess
 from abc import ABC
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 from uuid import uuid4
 
 from app.core.config import settings
 from app.log import logger
+from app.utils.http import RequestUtils
 
 
 class AgentCapabilityProvider(ABC):
@@ -411,6 +412,160 @@ class MiMoAudioProvider(OpenAIChatAudioProvider):
         return model
 
 
+class MiniMaxAudioProvider(OpenAIChatAudioProvider):
+    """MiniMax audio provider; speech synthesis uses the official T2A HTTP API."""
+
+    name = "minimax"
+    DISPLAY_NAME = "MiniMax"
+    DEFAULT_BASE_URL = "https://api.minimaxi.com/v1"
+    DEFAULT_STT_MODEL = "MiniMax-M2.7"
+    DEFAULT_TTS_MODEL = "speech-2.8-turbo"
+    DEFAULT_VOICE = "Chinese (Mandarin)_Lyrical_Voice"
+    AUDIO_INPUT_DATA_URL = True
+    SUPPORTED_TTS_MODELS = frozenset(
+        {
+            "speech-2.8-hd",
+            "speech-2.8-turbo",
+            "speech-2.6-hd",
+            "speech-2.6-turbo",
+            "speech-02-hd",
+            "speech-02-turbo",
+            "speech-01-hd",
+            "speech-01-turbo",
+        }
+    )
+
+    def _build_client(self, api_key: str, base_url: Optional[str]):
+        """Build the OpenAI-compatible MiniMax client, tolerating an Anthropic endpoint URL."""
+        from openai import OpenAI
+
+        return OpenAI(
+            api_key=api_key,
+            base_url=self._normalize_api_base_url(base_url),
+            max_retries=3,
+        )
+
+    @classmethod
+    def _normalize_api_base_url(cls, base_url: Optional[str]) -> str:
+        """Normalize the MiniMax API base URL so OpenAI/T2A paths can be appended to it."""
+        normalized = (base_url or cls.DEFAULT_BASE_URL).strip().rstrip("/")
+        if normalized.endswith("/t2a_v2"):
+            normalized = normalized[: -len("/t2a_v2")]
+        for suffix in ("/anthropic/v1", "/openai/v1"):
+            if normalized.endswith(suffix):
+                return normalized[: -len(suffix)] + "/v1"
+        if not normalized.endswith("/v1"):
+            normalized = f"{normalized}/v1"
+        return normalized
+
+    @classmethod
+    def _build_t2a_url(cls, base_url: Optional[str]) -> str:
+        """Return the URL of the MiniMax synchronous T2A endpoint."""
+        return f"{cls._normalize_api_base_url(base_url)}/t2a_v2"
+
+    def _normalize_stt_model(self) -> str:
+        """Return the configured STT model, or the default if it is empty or a gpt-/mimo- name."""
+        model = (settings.AUDIO_INPUT_MODEL or "").strip()
+        if not model or model.lower().startswith(("gpt-", "mimo-")):
+            return self.DEFAULT_STT_MODEL
+        return model
+
+    def _normalize_tts_model(self) -> str:
+        """Return the configured TTS model if it is a supported T2A model, else the default."""
+        model = (settings.AUDIO_OUTPUT_MODEL or "").strip().lower()
+        if model in self.SUPPORTED_TTS_MODELS:
+            return model
+        return self.DEFAULT_TTS_MODEL
+
+    def _normalize_voice_id(self) -> str:
+        """Return the configured voice, or the default for empty/other-provider default voices."""
+        voice_id = (settings.AUDIO_OUTPUT_VOICE or "").strip()
+        if not voice_id or voice_id in {"alloy", "mimo_default"}:
+            return self.DEFAULT_VOICE
+        return voice_id
+
+    @staticmethod
+    def _decode_audio_payload(audio_data: str) -> bytes:
+        """Decode T2A audio data, trying the hex format first and falling back to base64."""
+        normalized = "".join((audio_data or "").split())
+        try:
+            return bytes.fromhex(normalized)
+        except ValueError:
+            return base64.b64decode(audio_data)
+
+    @staticmethod
+    def _extract_minimax_error(data: Dict[str, Any]) -> Optional[str]:
+        """Return the base_resp error as "code: message", or None for a successful response."""
+        base_resp = data.get("base_resp") or {}
+        status_code = base_resp.get("status_code")
+        if status_code in (None, 0, "0"):
+            return None
+        status_msg = base_resp.get("status_msg") or "unknown error"
+        return f"{status_code}: {status_msg}"
+
+    def synthesize_speech(self, text: str) -> Optional[Path]:
+        """Synthesize speech via the MiniMax T2A HTTP API; return the file path, or None on failure."""
+        if not text:
+            return None
+
+        try:
+            api_key, base_url = self._output_credentials()
+            if not api_key:
+                raise ValueError("音频输出 provider 未配置 API Key")
+            response = RequestUtils(
+                headers={
+                    "Authorization": f"Bearer {api_key}",
+                    "Content-Type": "application/json",
+                    "Accept": "application/json",
+                },
+                proxies=settings.PROXY or {},
+                timeout=60,
+            ).post_res(
+                url=self._build_t2a_url(base_url),
+                json={
+                    "model": self._normalize_tts_model(),
+                    "text": text,
+                    "stream": False,
+                    "language_boost": "auto",
+                    "output_format": "hex",
+                    "voice_setting": {
+                        "voice_id": self._normalize_voice_id(),
+                        "speed": 1,
+                        "vol": 1,
+                        "pitch": 0,
+                    },
+                    "audio_setting": {
+                        "sample_rate": 32000,
+                        "bitrate": 128000,
+                        "format": "opus",
+                        "channel": 1,
+                    },
+                },
+            )
+            if response is None:  # a requests.Response is falsy on 4xx/5xx; only None means no reply
+                raise ValueError("MiniMax T2A 请求无响应")
+            if response.status_code >= 400:
+                raise ValueError(f"MiniMax T2A HTTP {response.status_code}")
+
+            result = response.json()
+            minimax_error = self._extract_minimax_error(result)
+            if minimax_error:
+                raise ValueError(f"MiniMax T2A 返回错误: {minimax_error}")
+
+            audio_data = ((result.get("data") or {}).get("audio") or "").strip()
+            if not audio_data:
+                raise ValueError("MiniMax T2A 响应中没有音频数据")
+
+            voice_dir = settings.TEMP_PATH / "voice"
+            voice_dir.mkdir(parents=True, exist_ok=True)
+            output_path = voice_dir / f"{uuid4().hex}.opus"
+            output_path.write_bytes(self._decode_audio_payload(audio_data))
+            return output_path
+        except Exception as err:
+            logger.error(f"音频输出合成失败: provider={self.name}, error={err}")
+            return None
+
+
 class AgentCapabilityManager:
     """Agent 能力统一入口。"""
 
@@ -420,6 +575,7 @@ class AgentCapabilityManager:
         OpenAIAudioProvider.name: OpenAIAudioProvider(),
         OpenAIChatAudioProvider.name: OpenAIChatAudioProvider(),
         MiMoAudioProvider.name: MiMoAudioProvider(),
+        MiniMaxAudioProvider.name: MiniMaxAudioProvider(),
     }
 
     @classmethod
diff --git a/app/core/config.py b/app/core/config.py
index 832486d4..cb832f35 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -600,7 +600,7 @@ class ConfigModel(BaseModel):
     # AI智能体自动重试整理失败记录开关
     AI_AGENT_RETRY_TRANSFER: bool = False
 
-    # 音频输入提供商:openai/openai_chat_audio/mimo
+    # Audio input provider: openai/openai_chat_audio/mimo/minimax
     AUDIO_INPUT_PROVIDER: str = "openai"
     # 音频输入 API 密钥
     AUDIO_INPUT_API_KEY: Optional[str] = None
@@ -610,7 +610,7 @@ class ConfigModel(BaseModel):
     AUDIO_INPUT_MODEL: str = "gpt-4o-mini-transcribe"
     # 音频输入识别语言
     AUDIO_INPUT_LANGUAGE: str = "zh"
-    # 音频输出提供商:openai/openai_chat_audio/mimo
+    # Audio output provider: openai/openai_chat_audio/mimo/minimax
     AUDIO_OUTPUT_PROVIDER: str = "openai"
     # 音频输出 API 密钥
     AUDIO_OUTPUT_API_KEY: Optional[str] = None
diff --git a/tests/test_agent_llm_capability.py b/tests/test_agent_llm_capability.py
index 12465874..0fcf9ee0 100644
--- a/tests/test_agent_llm_capability.py
+++ b/tests/test_agent_llm_capability.py
@@ -21,6 +21,7 @@ spec.loader.exec_module(capability_module)
 
 AgentCapabilityManager = capability_module.AgentCapabilityManager
 MiMoAudioProvider = capability_module.MiMoAudioProvider
+MiniMaxAudioProvider = capability_module.MiniMaxAudioProvider
 OpenAIChatAudioProvider = capability_module.OpenAIChatAudioProvider
 OpenAIAudioProvider = capability_module.OpenAIAudioProvider
 
@@ -32,6 +33,7 @@ class AgentCapabilityManagerTest(unittest.TestCase):
             "openai_chat_audio", AgentCapabilityManager.get_registered_audio_providers()
         )
         self.assertIn("mimo", AgentCapabilityManager.get_registered_audio_providers())
+        self.assertIn("minimax", AgentCapabilityManager.get_registered_audio_providers())
 
     def test_get_audio_provider_uses_separate_input_and_output_settings(self):
         with patch.object(settings, "AUDIO_INPUT_PROVIDER", "openai"), patch.object(
@@ -230,6 +232,66 @@ class AgentCapabilityManagerTest(unittest.TestCase):
         )
         self.assertIn("只输出转写结果", content[1]["text"])
 
+    def test_minimax_stt_normalizes_openai_default_model(self):
+        """MiniMax audio input falls back from the OpenAI default model to a MiniMax model."""
+        provider = MiniMaxAudioProvider()
+
+        with patch.object(settings, "AUDIO_INPUT_MODEL", "gpt-4o-mini-transcribe"):
+            self.assertEqual(provider._normalize_stt_model(), "MiniMax-M2.7")
+
+    def test_minimax_tts_uses_t2a_http_payload(self):
+        """MiniMax audio output calls the T2A HTTP API and writes the audio file."""
+        provider = MiniMaxAudioProvider()
+        fake_response = SimpleNamespace(
+            status_code=200,
+            json=Mock(
+                return_value={
+                    "data": {"audio": b"opus-bytes".hex(), "status": 2},
+                    "base_resp": {"status_code": 0, "status_msg": "success"},
+                }
+            ),
+        )
+        request_utils = Mock()
+        request_utils.post_res.return_value = fake_response
+
+        with TemporaryDirectory() as temp_dir, patch.object(
+            capability_module, "RequestUtils", return_value=request_utils
+        ) as request_utils_cls, patch.object(
+            capability_module,
+            "settings",
+            SimpleNamespace(
+                TEMP_PATH=Path(temp_dir),
+                PROXY={},
+                AUDIO_OUTPUT_MODEL="gpt-4o-mini-tts",
+                AUDIO_OUTPUT_VOICE="alloy",
+                AUDIO_OUTPUT_API_KEY="sk-test",
+                AUDIO_OUTPUT_BASE_URL="https://api.minimaxi.com/anthropic/v1",
+            ),
+        ):
+            output_path = provider.synthesize_speech("你好")
+            output_bytes = output_path.read_bytes() if output_path else None
+
+        self.assertIsNotNone(output_path)
+        self.assertEqual(output_bytes, b"opus-bytes")
+        request_utils_cls.assert_called_once()
+        request = request_utils.post_res.call_args.kwargs
+        self.assertEqual(request["url"], "https://api.minimaxi.com/v1/t2a_v2")
+        self.assertEqual(request["json"]["model"], "speech-2.8-turbo")
+        self.assertEqual(
+            request["json"]["voice_setting"]["voice_id"],
+            "Chinese (Mandarin)_Lyrical_Voice",
+        )
+        self.assertEqual(request["json"]["audio_setting"]["format"], "opus")
+
+    def test_minimax_tts_accepts_base64_audio_payload(self):
+        """MiniMax audio decoding accepts base64 audio data returned by some proxies."""
+        provider = MiniMaxAudioProvider()
+
+        self.assertEqual(
+            provider._decode_audio_payload(b64encode(b"opus-bytes").decode("utf-8")),
+            b"opus-bytes",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()