From 896631d63e4c5b50fa74900f7ba4ebb59acfbf92 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Tue, 26 May 2026 20:14:56 +0800 Subject: [PATCH] fix(agent): enable voice replies for supported channels --- app/agent/llm/capability.py | 64 +++++++++++++----- app/agent/tools/impl/send_voice_message.py | 17 +++-- app/schemas/message.py | 5 ++ tests/test_agent_llm_capability.py | 69 ++++++++++++++++++++ tests/test_agent_tool_streaming.py | 76 ++++++++++++++++++++++ 5 files changed, 208 insertions(+), 23 deletions(-) diff --git a/app/agent/llm/capability.py b/app/agent/llm/capability.py index a406df73..d9ebae80 100644 --- a/app/agent/llm/capability.py +++ b/app/agent/llm/capability.py @@ -734,29 +734,61 @@ class AgentCapabilityManager: return cls.REPLY_MODE_TEXT @classmethod - def supports_native_voice_reply( - cls, channel: Optional[str], source: Optional[str] - ) -> bool: - """判断当前渠道是否支持原生语音消息发送。""" + def _parse_message_channel(cls, channel: Optional[Any]): + """将渠道入参归一化为消息渠道枚举。""" if not channel: + return None + + from app.schemas.types import MessageChannel + + if isinstance(channel, MessageChannel): + return channel + + channel_text = str(channel).strip() + if not channel_text: + return None + lowered_channel = channel_text.lower() + for channel_item in MessageChannel: + aliases = { + channel_item.value.lower(), + channel_item.name.lower(), + f"{MessageChannel.__name__}.{channel_item.name}".lower(), + } + if lowered_channel in aliases: + return channel_item + return None + + @staticmethod + def _is_wechat_app_mode(source: Optional[str]) -> bool: + """判断企业微信来源是否为自建应用模式。""" + if not source: return False from app.helper.service import ServiceConfigHelper - from app.schemas.types import MessageChannel - try: - channel_enum = MessageChannel(channel) - except (TypeError, ValueError): - return False - - if channel_enum == MessageChannel.Telegram: - return True - if channel_enum != MessageChannel.Wechat: - return False - - # 企业微信 bot 模式不支持发送语音,只有应用模式可用。 for config in ServiceConfigHelper.get_notification_configs(): if config.name != source: continue return (config.config or {}).get("WECHAT_MODE", "app") != "bot" return False + + @classmethod + def supports_native_voice_reply( + cls, channel: Optional[str], source: Optional[str] + ) -> bool: + """判断当前渠道是否支持原生语音消息发送。""" + from app.schemas.message import ChannelCapability, ChannelCapabilityManager + from app.schemas.types import MessageChannel + + channel_enum = cls._parse_message_channel(channel) + if not channel_enum: + return False + + if not ChannelCapabilityManager.supports_capability( + channel_enum, ChannelCapability.AUDIO_OUTPUT + ): + return False + + if channel_enum == MessageChannel.Wechat: + return cls._is_wechat_app_mode(source) + return True diff --git a/app/agent/tools/impl/send_voice_message.py b/app/agent/tools/impl/send_voice_message.py index 5bd1e423..a07ee8ec 100644 --- a/app/agent/tools/impl/send_voice_message.py +++ b/app/agent/tools/impl/send_voice_message.py @@ -15,8 +15,10 @@ from app.schemas import Notification, NotificationType class SendVoiceMessageInput(BaseModel): """发送语音消息工具输入。""" - explanation: Optional[str] = Field(None, - description="Clear explanation of why a voice reply is the best fit in the current context",) + explanation: Optional[str] = Field( + None, + description="Clear explanation of why a voice reply is the best fit in the current context", + ) message: str = Field( ..., description="The spoken content to send back to the user", @@ -24,6 +26,8 @@ class SendVoiceMessageInput(BaseModel): class SendVoiceMessageTool(MoviePilotTool): + """发送 Agent 语音回复的工具。""" + name: str = "send_voice_message" sends_message: bool = True description: str = ( @@ -36,12 +40,14 @@ class SendVoiceMessageTool(MoviePilotTool): require_admin: bool = False def get_tool_message(self, **kwargs) -> Optional[str]: + """生成语音回复工具的执行提示。""" message = kwargs.get("message") or "" if len(message) > 40: message = message[:40] + "..." return f"发送语音回复: {message}" async def run(self, message: str, **kwargs) -> str: + """合成语音并发送到当前对话渠道,不支持时回退为文字。""" if not message: return "语音回复内容不能为空" @@ -69,11 +75,8 @@ class SendVoiceMessageTool(MoviePilotTool): fallback_reason = "当前未配置可用的语音合成能力" logger.info( - "执行工具: %s, channel=%s, use_voice=%s, text_len=%s", - self.name, - channel, - used_voice, - len(message), + f"执行工具: {self.name}, channel={channel}, " + f"use_voice={used_voice}, text_len={len(message)}" ) await ToolChain().async_post_message( diff --git a/app/schemas/message.py b/app/schemas/message.py index 64e365ab..edae1760 100644 --- a/app/schemas/message.py +++ b/app/schemas/message.py @@ -273,6 +273,8 @@ class ChannelCapability(Enum): IMAGES = "images" # 支持链接 LINKS = "links" + # 支持原生语音输出 + AUDIO_OUTPUT = "audio_output" # 支持文件发送 FILE_SENDING = "file_sending" # 支持可收口的消息处理状态提示,如 reaction 或 typing @@ -313,6 +315,7 @@ class ChannelCapabilityManager: ChannelCapability.RICH_TEXT, ChannelCapability.IMAGES, ChannelCapability.LINKS, + ChannelCapability.AUDIO_OUTPUT, ChannelCapability.FILE_SENDING, ChannelCapability.PROCESSING_STATUS, }, @@ -327,6 +330,7 @@ class ChannelCapabilityManager: capabilities={ ChannelCapability.IMAGES, ChannelCapability.LINKS, + ChannelCapability.AUDIO_OUTPUT, ChannelCapability.MENU_COMMANDS, }, fallback_enabled=True, @@ -341,6 +345,7 @@ class ChannelCapabilityManager: ChannelCapability.RICH_TEXT, ChannelCapability.IMAGES, ChannelCapability.LINKS, + ChannelCapability.AUDIO_OUTPUT, ChannelCapability.FILE_SENDING, ChannelCapability.PROCESSING_STATUS, }, diff --git a/tests/test_agent_llm_capability.py b/tests/test_agent_llm_capability.py index 6b58e4a1..11e4eb5d 100644 --- a/tests/test_agent_llm_capability.py +++ b/tests/test_agent_llm_capability.py @@ -11,6 +11,8 @@ sys.modules.setdefault("psutil", Mock()) sys.modules.setdefault("pyquery", Mock()) from app.core.config import settings +from app.schemas.message import ChannelCapability, ChannelCapabilityManager +from app.schemas.types import MessageChannel module_path = Path(__file__).resolve().parents[1] / "app" / "agent" / "llm" / "capability.py" spec = importlib.util.spec_from_file_location("test_agent_llm_capability_module", module_path) @@ -157,6 +159,73 @@ class AgentCapabilityManagerTest(unittest.TestCase): self.assertEqual(result, Path("/tmp/reply.opus")) provider.synthesize_speech.assert_called_once_with(text="你好") + def test_native_voice_reply_supports_channels_with_audio_output(self): + """校验 Agent 语音回复渠道支持判断覆盖常见渠道写法。""" + self.assertTrue( + AgentCapabilityManager.supports_native_voice_reply("telegram", None) + ) + self.assertTrue( + AgentCapabilityManager.supports_native_voice_reply( + MessageChannel.Telegram.value, None + ) + ) + self.assertTrue( + AgentCapabilityManager.supports_native_voice_reply( + MessageChannel.Feishu.value, None + ) + ) + self.assertTrue( + AgentCapabilityManager.supports_native_voice_reply("Feishu", None) + ) + self.assertFalse( + AgentCapabilityManager.supports_native_voice_reply("Slack", None) + ) + + def test_native_voice_reply_respects_wechat_mode(self): + """校验企业微信只有自建应用模式允许 Agent 语音回复。""" + configs = [ + SimpleNamespace(name="wechat-app", config={"WECHAT_MODE": "app"}), + SimpleNamespace(name="wechat-bot", config={"WECHAT_MODE": "bot"}), + ] + + with patch( + "app.helper.service.ServiceConfigHelper.get_notification_configs", + return_value=configs, + ): + self.assertTrue( + AgentCapabilityManager.supports_native_voice_reply( + MessageChannel.Wechat.value, "wechat-app" + ) + ) + self.assertFalse( + AgentCapabilityManager.supports_native_voice_reply( + MessageChannel.Wechat.value, "wechat-bot" + ) + ) + self.assertFalse( + AgentCapabilityManager.supports_native_voice_reply( + MessageChannel.Wechat.value, "missing" + ) + ) + + def test_channel_capability_marks_voice_output_channels(self): + """校验消息渠道能力显式声明原生语音输出支持。""" + for channel in ( + MessageChannel.Telegram, + MessageChannel.Feishu, + MessageChannel.Wechat, + ): + self.assertTrue( + ChannelCapabilityManager.supports_capability( + channel, ChannelCapability.AUDIO_OUTPUT + ) + ) + self.assertFalse( + ChannelCapabilityManager.supports_capability( + MessageChannel.Slack, ChannelCapability.AUDIO_OUTPUT + ) + ) + def test_mimo_tts_uses_chat_completions_audio_payload(self): provider = MiMoAudioProvider() fake_client = Mock() diff --git a/tests/test_agent_tool_streaming.py b/tests/test_agent_tool_streaming.py index 1a13a998..52021815 100644 --- a/tests/test_agent_tool_streaming.py +++ b/tests/test_agent_tool_streaming.py @@ -1,5 +1,6 @@ import asyncio import unittest +from pathlib import Path from unittest.mock import AsyncMock, patch import langchain.agents as langchain_agents @@ -9,6 +10,7 @@ if not hasattr(langchain_agents, "create_agent"): from app.agent.callback import StreamingHandler from app.agent.tools.base import MoviePilotTool +from app.agent.tools.impl.send_voice_message import SendVoiceMessageTool from app.api.endpoints.openai import _OpenAIStreamingHandler from app.core.config import settings from app.schemas.message import MessageResponse @@ -397,6 +399,80 @@ class TestAgentToolStreaming(unittest.TestCase): send_tool_message.assert_awaited_once_with("前置内容\n\n⚙️ => run test tool") self.assertEqual(buffered_message, "") + def test_send_voice_message_uses_native_voice_for_supported_channels(self): + """校验支持语音输出的渠道会发送原生语音消息。""" + + async def _run(channel: MessageChannel): + """运行指定渠道的语音发送工具。""" + tool = SendVoiceMessageTool(session_id="session-1", user_id="10001") + tool.set_message_attr( + channel=channel.value, source=f"{channel.name.lower()}-main", username="tester" + ) + + with ( + patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True), + patch.object(settings, "AUDIO_OUTPUT_INCLUDE_TEXT", True), + patch( + "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.is_audio_output_available", + return_value=True, + ), + patch( + "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.synthesize_speech", + return_value=Path("/tmp/reply.opus"), + ) as synthesize_speech, + patch( + "app.agent.tools.impl.send_voice_message.ToolChain.async_post_message", + new_callable=AsyncMock, + ) as async_post_message, + ): + result = await tool.run("你好") + return result, synthesize_speech, async_post_message + + for channel in (MessageChannel.Telegram, MessageChannel.Feishu): + result, synthesize_speech, async_post_message = asyncio.run(_run(channel)) + notification = async_post_message.await_args.args[0] + + self.assertEqual(result, "语音回复已发送") + synthesize_speech.assert_called_once_with("你好") + self.assertEqual(notification.channel, channel) + self.assertEqual(notification.voice_path, "/tmp/reply.opus") + self.assertEqual(notification.voice_caption, "你好") + + def test_send_voice_message_falls_back_for_unsupported_channels(self): + """校验不支持语音输出的渠道继续回退为文字消息。""" + + async def _run(): + """运行不支持语音输出渠道的语音发送工具。""" + tool = SendVoiceMessageTool(session_id="session-1", user_id="10001") + tool.set_message_attr( + channel=MessageChannel.Slack.value, source="slack-main", username="tester" + ) + + with ( + patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True), + patch( + "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.is_audio_output_available", + return_value=True, + ), + patch( + "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.synthesize_speech" + ) as synthesize_speech, + patch( + "app.agent.tools.impl.send_voice_message.ToolChain.async_post_message", + new_callable=AsyncMock, + ) as async_post_message, + ): + result = await tool.run("你好") + return result, synthesize_speech, async_post_message + + result, synthesize_speech, async_post_message = asyncio.run(_run()) + notification = async_post_message.await_args.args[0] + + self.assertEqual(result, "当前渠道不支持语音回复,已自动回退为文字回复") + synthesize_speech.assert_not_called() + self.assertEqual(notification.text, "你好") + self.assertIsNone(notification.voice_path) + if __name__ == "__main__": unittest.main()