From 896631d63e4c5b50fa74900f7ba4ebb59acfbf92 Mon Sep 17 00:00:00 2001
From: jxxghp <jxxghp@gmail.com>
Date: Tue, 26 May 2026 20:14:56 +0800
Subject: [PATCH] fix(agent): enable voice replies for supported channels

---
 app/agent/llm/capability.py                | 64 +++++++++++++-----
 app/agent/tools/impl/send_voice_message.py | 17 +++--
 app/schemas/message.py                     |  5 ++
 tests/test_agent_llm_capability.py         | 69 ++++++++++++++++++++
 tests/test_agent_tool_streaming.py         | 76 ++++++++++++++++++++++
 5 files changed, 208 insertions(+), 23 deletions(-)

diff --git a/app/agent/llm/capability.py b/app/agent/llm/capability.py
index a406df73..d9ebae80 100644
--- a/app/agent/llm/capability.py
+++ b/app/agent/llm/capability.py
@@ -734,29 +734,61 @@ class AgentCapabilityManager:
         return cls.REPLY_MODE_TEXT
 
     @classmethod
-    def supports_native_voice_reply(
-        cls, channel: Optional[str], source: Optional[str]
-    ) -> bool:
-        """判断当前渠道是否支持原生语音消息发送。"""
+    def _parse_message_channel(cls, channel: Optional[Any]):
+        """Normalize the channel argument into a MessageChannel member, or None if nothing matches."""
         if not channel:
+            return None
+
+        from app.schemas.types import MessageChannel
+
+        if isinstance(channel, MessageChannel):
+            return channel
+
+        channel_text = str(channel).strip()
+        if not channel_text:
+            return None
+        lowered_channel = channel_text.lower()  # aliases below are lowercased, so matching is case-insensitive
+        for channel_item in MessageChannel:
+            aliases = {
+                channel_item.value.lower(),
+                channel_item.name.lower(),
+                f"{MessageChannel.__name__}.{channel_item.name}".lower(),  # e.g. "messagechannel.telegram"
+            }
+            if lowered_channel in aliases:
+                return channel_item
+        return None
+
+    @staticmethod
+    def _is_wechat_app_mode(source: Optional[str]) -> bool:
+        """Return whether the notification config named ``source`` is in WeCom app mode (WECHAT_MODE is not "bot")."""
+        if not source:
             return False
 
         from app.helper.service import ServiceConfigHelper
-        from app.schemas.types import MessageChannel
 
-        try:
-            channel_enum = MessageChannel(channel)
-        except (TypeError, ValueError):
-            return False
-
-        if channel_enum == MessageChannel.Telegram:
-            return True
-        if channel_enum != MessageChannel.Wechat:
-            return False
-
-        # 企业微信 bot 模式不支持发送语音，只有应用模式可用。
         for config in ServiceConfigHelper.get_notification_configs():
             if config.name != source:
                 continue
             return (config.config or {}).get("WECHAT_MODE", "app") != "bot"
         return False
+
+    @classmethod
+    def supports_native_voice_reply(
+            cls, channel: Optional[str], source: Optional[str]
+    ) -> bool:
+        """Return whether the given channel and source can send native voice messages."""
+        from app.schemas.message import ChannelCapability, ChannelCapabilityManager
+        from app.schemas.types import MessageChannel
+
+        channel_enum = cls._parse_message_channel(channel)
+        if not channel_enum:
+            return False
+
+        if not ChannelCapabilityManager.supports_capability(
+                channel_enum, ChannelCapability.AUDIO_OUTPUT
+        ):
+            return False
+
+        if channel_enum == MessageChannel.Wechat:
+            return cls._is_wechat_app_mode(source)  # WeCom bot mode cannot send voice; only app mode can
+        return True
diff --git a/app/agent/tools/impl/send_voice_message.py b/app/agent/tools/impl/send_voice_message.py
index 5bd1e423..a07ee8ec 100644
--- a/app/agent/tools/impl/send_voice_message.py
+++ b/app/agent/tools/impl/send_voice_message.py
@@ -15,8 +15,10 @@ from app.schemas import Notification, NotificationType
 class SendVoiceMessageInput(BaseModel):
     """发送语音消息工具输入。"""
 
-    explanation: Optional[str] = Field(None,
-        description="Clear explanation of why a voice reply is the best fit in the current context",)
+    explanation: Optional[str] = Field(
+        None,
+        description="Clear explanation of why a voice reply is the best fit in the current context",
+    )
     message: str = Field(
         ...,
         description="The spoken content to send back to the user",
@@ -24,6 +26,8 @@ class SendVoiceMessageInput(BaseModel):
 
 
 class SendVoiceMessageTool(MoviePilotTool):
+    """发送 Agent 语音回复的工具。"""
+
     name: str = "send_voice_message"
     sends_message: bool = True
     description: str = (
@@ -36,12 +40,14 @@ class SendVoiceMessageTool(MoviePilotTool):
     require_admin: bool = False
 
     def get_tool_message(self, **kwargs) -> Optional[str]:
+        """Build the execution hint for the voice reply tool, truncating the message to 40 characters."""
         message = kwargs.get("message") or ""
         if len(message) > 40:
             message = message[:40] + "..."
         return f"发送语音回复: {message}"
 
     async def run(self, message: str, **kwargs) -> str:
+        """合成语音并发送到当前对话渠道，不支持时回退为文字。"""
         if not message:
             return "语音回复内容不能为空"
 
@@ -69,11 +75,8 @@ class SendVoiceMessageTool(MoviePilotTool):
             fallback_reason = "当前未配置可用的语音合成能力"
 
         logger.info(
-            "执行工具: %s, channel=%s, use_voice=%s, text_len=%s",
-            self.name,
-            channel,
-            used_voice,
-            len(message),
+            f"执行工具: {self.name}, channel={channel}, "
+            f"use_voice={used_voice}, text_len={len(message)}"
         )
 
         await ToolChain().async_post_message(
diff --git a/app/schemas/message.py b/app/schemas/message.py
index 64e365ab..edae1760 100644
--- a/app/schemas/message.py
+++ b/app/schemas/message.py
@@ -273,6 +273,8 @@ class ChannelCapability(Enum):
     IMAGES = "images"
     # 支持链接
     LINKS = "links"
+    # 支持原生语音输出
+    AUDIO_OUTPUT = "audio_output"
     # 支持文件发送
     FILE_SENDING = "file_sending"
     # 支持可收口的消息处理状态提示，如 reaction 或 typing
@@ -313,6 +315,7 @@ class ChannelCapabilityManager:
                 ChannelCapability.RICH_TEXT,
                 ChannelCapability.IMAGES,
                 ChannelCapability.LINKS,
+                ChannelCapability.AUDIO_OUTPUT,
                 ChannelCapability.FILE_SENDING,
                 ChannelCapability.PROCESSING_STATUS,
             },
@@ -327,6 +330,7 @@ class ChannelCapabilityManager:
             capabilities={
                 ChannelCapability.IMAGES,
                 ChannelCapability.LINKS,
+                ChannelCapability.AUDIO_OUTPUT,
                 ChannelCapability.MENU_COMMANDS,
             },
             fallback_enabled=True,
@@ -341,6 +345,7 @@ class ChannelCapabilityManager:
                 ChannelCapability.RICH_TEXT,
                 ChannelCapability.IMAGES,
                 ChannelCapability.LINKS,
+                ChannelCapability.AUDIO_OUTPUT,
                 ChannelCapability.FILE_SENDING,
                 ChannelCapability.PROCESSING_STATUS,
             },
diff --git a/tests/test_agent_llm_capability.py b/tests/test_agent_llm_capability.py
index 6b58e4a1..11e4eb5d 100644
--- a/tests/test_agent_llm_capability.py
+++ b/tests/test_agent_llm_capability.py
@@ -11,6 +11,8 @@ sys.modules.setdefault("psutil", Mock())
 sys.modules.setdefault("pyquery", Mock())
 
 from app.core.config import settings
+from app.schemas.message import ChannelCapability, ChannelCapabilityManager
+from app.schemas.types import MessageChannel
 
 module_path = Path(__file__).resolve().parents[1] / "app" / "agent" / "llm" / "capability.py"
 spec = importlib.util.spec_from_file_location("test_agent_llm_capability_module", module_path)
@@ -157,6 +159,73 @@ class AgentCapabilityManagerTest(unittest.TestCase):
         self.assertEqual(result, Path("/tmp/reply.opus"))
         provider.synthesize_speech.assert_called_once_with(text="你好")
 
+    def test_native_voice_reply_supports_channels_with_audio_output(self):
+        """Verify that voice-reply channel detection accepts the common spellings of a channel name."""
+        self.assertTrue(
+            AgentCapabilityManager.supports_native_voice_reply("telegram", None)
+        )
+        self.assertTrue(
+            AgentCapabilityManager.supports_native_voice_reply(
+                MessageChannel.Telegram.value, None
+            )
+        )
+        self.assertTrue(
+            AgentCapabilityManager.supports_native_voice_reply(
+                MessageChannel.Feishu.value, None
+            )
+        )
+        self.assertTrue(
+            AgentCapabilityManager.supports_native_voice_reply("Feishu", None)
+        )
+        self.assertFalse(
+            AgentCapabilityManager.supports_native_voice_reply("Slack", None)
+        )
+
+    def test_native_voice_reply_respects_wechat_mode(self):
+        """Verify that WeCom allows agent voice replies only in app mode, not bot mode or unknown sources."""
+        configs = [
+            SimpleNamespace(name="wechat-app", config={"WECHAT_MODE": "app"}),
+            SimpleNamespace(name="wechat-bot", config={"WECHAT_MODE": "bot"}),
+        ]
+
+        with patch(
+            "app.helper.service.ServiceConfigHelper.get_notification_configs",
+            return_value=configs,
+        ):
+            self.assertTrue(
+                AgentCapabilityManager.supports_native_voice_reply(
+                    MessageChannel.Wechat.value, "wechat-app"
+                )
+            )
+            self.assertFalse(
+                AgentCapabilityManager.supports_native_voice_reply(
+                    MessageChannel.Wechat.value, "wechat-bot"
+                )
+            )
+            self.assertFalse(
+                AgentCapabilityManager.supports_native_voice_reply(
+                    MessageChannel.Wechat.value, "missing"
+                )
+            )
+
+    def test_channel_capability_marks_voice_output_channels(self):
+        """Verify that channel capabilities explicitly declare native audio output support."""
+        for channel in (
+            MessageChannel.Telegram,
+            MessageChannel.Feishu,
+            MessageChannel.Wechat,
+        ):
+            self.assertTrue(
+                ChannelCapabilityManager.supports_capability(
+                    channel, ChannelCapability.AUDIO_OUTPUT
+                )
+            )
+        self.assertFalse(
+            ChannelCapabilityManager.supports_capability(
+                MessageChannel.Slack, ChannelCapability.AUDIO_OUTPUT
+            )
+        )
+
     def test_mimo_tts_uses_chat_completions_audio_payload(self):
         provider = MiMoAudioProvider()
         fake_client = Mock()
diff --git a/tests/test_agent_tool_streaming.py b/tests/test_agent_tool_streaming.py
index 1a13a998..52021815 100644
--- a/tests/test_agent_tool_streaming.py
+++ b/tests/test_agent_tool_streaming.py
@@ -1,5 +1,6 @@
 import asyncio
 import unittest
+from pathlib import Path
 from unittest.mock import AsyncMock, patch
 
 import langchain.agents as langchain_agents
@@ -9,6 +10,7 @@ if not hasattr(langchain_agents, "create_agent"):
 
 from app.agent.callback import StreamingHandler
 from app.agent.tools.base import MoviePilotTool
+from app.agent.tools.impl.send_voice_message import SendVoiceMessageTool
 from app.api.endpoints.openai import _OpenAIStreamingHandler
 from app.core.config import settings
 from app.schemas.message import MessageResponse
@@ -397,6 +399,80 @@ class TestAgentToolStreaming(unittest.TestCase):
         send_tool_message.assert_awaited_once_with("前置内容\n\n⚙️ => run test tool")
         self.assertEqual(buffered_message, "")
 
+    def test_send_voice_message_uses_native_voice_for_supported_channels(self):
+        """Verify that channels supporting audio output send a native voice message."""
+
+        async def _run(channel: MessageChannel):
+            """Run the voice message tool for the given channel."""
+            tool = SendVoiceMessageTool(session_id="session-1", user_id="10001")
+            tool.set_message_attr(
+                channel=channel.value, source=f"{channel.name.lower()}-main", username="tester"
+            )
+
+            with (
+                patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True),
+                patch.object(settings, "AUDIO_OUTPUT_INCLUDE_TEXT", True),
+                patch(
+                    "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.is_audio_output_available",
+                    return_value=True,
+                ),
+                patch(
+                    "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.synthesize_speech",
+                    return_value=Path("/tmp/reply.opus"),
+                ) as synthesize_speech,
+                patch(
+                    "app.agent.tools.impl.send_voice_message.ToolChain.async_post_message",
+                    new_callable=AsyncMock,
+                ) as async_post_message,
+            ):
+                result = await tool.run("你好")
+            return result, synthesize_speech, async_post_message
+
+        for channel in (MessageChannel.Telegram, MessageChannel.Feishu):
+            result, synthesize_speech, async_post_message = asyncio.run(_run(channel))
+            notification = async_post_message.await_args.args[0]
+
+            self.assertEqual(result, "语音回复已发送")
+            synthesize_speech.assert_called_once_with("你好")
+            self.assertEqual(notification.channel, channel)
+            self.assertEqual(notification.voice_path, "/tmp/reply.opus")
+            self.assertEqual(notification.voice_caption, "你好")
+
+    def test_send_voice_message_falls_back_for_unsupported_channels(self):
+        """Verify that channels without audio output still fall back to a text message."""
+
+        async def _run():
+            """Run the voice message tool on a channel without audio output."""
+            tool = SendVoiceMessageTool(session_id="session-1", user_id="10001")
+            tool.set_message_attr(
+                channel=MessageChannel.Slack.value, source="slack-main", username="tester"
+            )
+
+            with (
+                patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True),
+                patch(
+                    "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.is_audio_output_available",
+                    return_value=True,
+                ),
+                patch(
+                    "app.agent.tools.impl.send_voice_message.AgentCapabilityManager.synthesize_speech"
+                ) as synthesize_speech,
+                patch(
+                    "app.agent.tools.impl.send_voice_message.ToolChain.async_post_message",
+                    new_callable=AsyncMock,
+                ) as async_post_message,
+            ):
+                result = await tool.run("你好")
+            return result, synthesize_speech, async_post_message
+
+        result, synthesize_speech, async_post_message = asyncio.run(_run())
+        notification = async_post_message.await_args.args[0]
+
+        self.assertEqual(result, "当前渠道不支持语音回复，已自动回退为文字回复")
+        synthesize_speech.assert_not_called()
+        self.assertEqual(notification.text, "你好")
+        self.assertIsNone(notification.voice_path)
+
 
 if __name__ == "__main__":
     unittest.main()