diff --git a/app/agent/__init__.py b/app/agent/__init__.py index d606b9e5..f098afa0 100644 --- a/app/agent/__init__.py +++ b/app/agent/__init__.py @@ -861,6 +861,7 @@ class MoviePilotAgent: message: str, images: List[str] = None, files: Optional[List[dict]] = None, + has_audio_input: bool = False, ) -> str: """ 处理用户消息,流式推理并返回 Agent 回复 @@ -868,7 +869,8 @@ class MoviePilotAgent: try: logger.info( f"Agent推理: session_id={self.session_id}, input={message}, " - f"images={len(images) if images else 0}, files={len(files) if files else 0}" + f"images={len(images) if images else 0}, files={len(files) if files else 0}, " + f"audio_input={has_audio_input}" ) self._tool_context = { "user_reply_sent": False, @@ -885,6 +887,10 @@ class MoviePilotAgent: # 构建结构化用户消息内容 request_payload = { "message": message or "", + "input": { + "mode": "voice" if has_audio_input else "text", + "transcribed": bool(has_audio_input), + }, "images": [ {"index": index + 1, "type": "image"} for index, _ in enumerate(images or []) @@ -1187,6 +1193,7 @@ class _MessageTask: message: str images: Optional[List[str]] = None files: Optional[List[dict]] = None + has_audio_input: bool = False channel: Optional[str] = None source: Optional[str] = None username: Optional[str] = None @@ -1333,6 +1340,7 @@ class AgentManager: message: str, images: List[str] = None, files: Optional[List[dict]] = None, + has_audio_input: bool = False, channel: str = None, source: str = None, username: str = None, @@ -1352,6 +1360,7 @@ class AgentManager: message=message, images=images, files=files, + has_audio_input=has_audio_input, channel=channel, source=source, username=username, @@ -1488,7 +1497,13 @@ class AgentManager: agent.persist_output_message = task.persist_output_message agent.allow_message_tools = task.allow_message_tools - return await agent.process(task.message, images=task.images, files=task.files) + process_kwargs = { + "images": task.images, + "files": task.files, + } + if task.has_audio_input: + process_kwargs["has_audio_input"] = True + return await agent.process(task.message, **process_kwargs) async def stop_current_task(self, session_id: str): """ diff --git a/app/agent/prompt/System Core Prompt.txt b/app/agent/prompt/System Core Prompt.txt index 54062653..feb3d408 100644 --- a/app/agent/prompt/System Core Prompt.txt +++ b/app/agent/prompt/System Core Prompt.txt @@ -35,7 +35,7 @@ You act as a proactive agent. Your goal is to fully resolve the user's media-rel - Treat manual download and subscription automation as two execution modes of the same acquisition pipeline. Manual download is user-triggered immediate acquisition; subscription is persistent site-driven monitoring and acquisition. - Keep the user anchored to the operational step that matters now: site, search, recognition, download, subscription, transfer, or status/history. - Users may attach images from supported channels; analyze them together with the text when relevant. -- User messages may arrive as structured JSON. Treat the `message` field as the user's text. Attachments appear in `files`; when `local_path` is present, use local file tools to inspect the uploaded file directly. When image input is disabled for the current model, user images may also be delivered through `files`. +- User messages may arrive as structured JSON. Treat the `message` field as the user's text. Input metadata appears in `input`; when `input.mode` is `voice`, the user sent a voice message and `message` contains its transcript. Attachments appear in `files`; when `local_path` is present, use local file tools to inspect the uploaded file directly. When image input is disabled for the current model, user images may also be delivered through `files`. diff --git a/app/agent/prompt/__init__.py b/app/agent/prompt/__init__.py index 054f454e..30d29c9a 100644 --- a/app/agent/prompt/__init__.py +++ b/app/agent/prompt/__init__.py @@ -396,7 +396,12 @@ class PromptManager: return ( "Use normal text replies by default. Only call `send_voice_message` " "when the user explicitly asks for a voice reply or spoken playback " - "is clearly better than plain text." + "is clearly better than plain text. `send_voice_message` is a terminal " + "response tool: put the complete user-facing reply in its `message` " + "argument, then stop the turn. Do not also call `send_message`, do not " + "write a final text reply after it, and do not repeat the same content " + "as plain text. If native voice is unavailable, the tool sends the same " + "content as a text fallback and still completes the reply." ) @staticmethod @@ -410,9 +415,11 @@ class PromptManager: ): return ( "- User questions: If you need the user to choose from a few clear options, " - "call `ask_user_choice` to send button options. After the user clicks a button, " - "the selected value will come back as the user's next message. After calling this tool, " - "wait for the user's selection instead of repeating the question in plain text." + "call `ask_user_choice` to send button options. `ask_user_choice` is a terminal " + "interaction tool: put the full question and all options in the tool call, then " + "stop the turn and wait for the user's selection. The selected value will come back " + "as the user's next message. Do not also call `send_message`, do not write a final " + "text reply after it, and do not repeat the question in plain text." ) return "- User questions: When you truly need user input, ask briefly in plain text." diff --git a/app/agent/tools/impl/ask_user_choice.py b/app/agent/tools/impl/ask_user_choice.py index ccc589c0..3048ca54 100644 --- a/app/agent/tools/impl/ask_user_choice.py +++ b/app/agent/tools/impl/ask_user_choice.py @@ -71,7 +71,9 @@ class AskUserChoiceTool(MoviePilotTool): return_direct: bool = True description: str = ( "Ask the user to choose from button options on channels that support interactive buttons. " - "After the user clicks a button, the selected value will come back as the user's next message." + "This is a terminal interaction tool: put the full question and all options in this call, " + "then stop the current turn. After the user clicks a button, the selected value will come " + "back as the user's next message. Do not also send the same question as plain text." ) args_schema: Type[BaseModel] = AskUserChoiceInput require_admin: bool = False diff --git a/app/agent/tools/impl/send_voice_message.py b/app/agent/tools/impl/send_voice_message.py index b4f27d15..1012fda6 100644 --- a/app/agent/tools/impl/send_voice_message.py +++ b/app/agent/tools/impl/send_voice_message.py @@ -35,7 +35,9 @@ class SendVoiceMessageTool(MoviePilotTool): "Send a voice reply to the current user. Use this only when the user explicitly asks for " "a voice reply or when spoken playback is clearly better than plain text. On channels " "without voice support or when TTS is unavailable, it automatically falls back to sending " - "the same content as plain text." + "the same content as plain text. This is a terminal response tool: put the complete " + "user-facing reply in `message`; after this tool runs, do not send another text reply " + "or call `send_message` with the same content." ) args_schema: Type[BaseModel] = SendVoiceMessageInput require_admin: bool = False diff --git a/app/chain/message.py b/app/chain/message.py index d3825823..07682cf1 100644 --- a/app/chain/message.py +++ b/app/chain/message.py @@ -350,6 +350,7 @@ class MessageChain(ChainBase): original_chat_id=original_chat_id, images=images, files=files, + has_audio_input=has_audio_input, ) if ( @@ -366,6 +367,7 @@ class MessageChain(ChainBase): original_chat_id=original_chat_id, images=images, files=files, + has_audio_input=has_audio_input, ) if MediaInteractionChain().handle_text_interaction( @@ -1204,6 +1206,7 @@ class MessageChain(ChainBase): images: Optional[List[CommingMessage.MessageImage]] = None, files: Optional[List[CommingMessage.MessageAttachment]] = None, session_id: Optional[str] = None, + has_audio_input: bool = False, ) -> bool: """ 处理AI智能体消息 @@ -1317,6 +1320,8 @@ class MessageChain(ChainBase): else None, "original_chat_id": original_chat_id, } + if has_audio_input: + process_kwargs["has_audio_input"] = True # 在事件循环中处理 asyncio.run_coroutine_threadsafe( agent_manager.process_message(**process_kwargs), diff --git a/tests/test_agent_background_output.py b/tests/test_agent_background_output.py index 3e76aa43..adac5792 100644 --- a/tests/test_agent_background_output.py +++ b/tests/test_agent_background_output.py @@ -10,6 +10,7 @@ from app.agent import ( AgentManager, ReplyMode, UNSUPPORTED_IMAGE_INPUT_MESSAGE, + _MessageTask, ) from app.agent.memory import memory_manager from app.agent.tools.factory import MoviePilotToolFactory @@ -288,6 +289,28 @@ class AgentBackgroundOutputTest(unittest.IsolatedAsyncioTestCase): process_message.assert_not_awaited() + async def test_agent_manager_preserves_voice_input_flag(self): + """会话队列执行时应把语音输入标记继续传给 Agent。""" + manager = AgentManager() + agent = MoviePilotAgent(session_id="session-1", user_id="user-1") + manager.active_agents["session-1"] = agent + agent.process = AsyncMock(return_value="ok") + task = _MessageTask( + session_id="session-1", + user_id="user-1", + message="帮我推荐一部电影", + has_audio_input=True, + ) + + await manager._process_message_internal(task) + + agent.process.assert_awaited_once_with( + "帮我推荐一部电影", + images=None, + files=None, + has_audio_input=True, + ) + async def test_create_agent_excludes_activity_log_for_heartbeat_session(self): agent = MoviePilotAgent( session_id=f"{HEARTBEAT_SESSION_PREFIX}test__", diff --git a/tests/test_agent_image_support.py b/tests/test_agent_image_support.py index fababa6d..849add88 100644 --- a/tests/test_agent_image_support.py +++ b/tests/test_agent_image_support.py @@ -242,6 +242,7 @@ class AgentImageSupportTest(unittest.TestCase): handle_ai_message.assert_called_once() self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影") + self.assertTrue(handle_ai_message.call_args.kwargs["has_audio_input"]) self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs) def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self): @@ -390,8 +391,36 @@ class AgentImageSupportTest(unittest.TestCase): self.assertIsInstance(content, list) payload = json.loads(content[0]["text"]) self.assertEqual(payload["message"], "帮我总结这个文件") + self.assertEqual(payload["input"]["mode"], "text") + self.assertFalse(payload["input"]["transcribed"]) self.assertEqual(payload["files"][0]["local_path"], "/tmp/report.txt") + def test_agent_process_marks_voice_input_in_structured_json(self): + """语音输入应在结构化消息中标记为转写来源。""" + agent = MoviePilotAgent( + session_id="session-1", + user_id="user-1", + channel=MessageChannel.Telegram.value, + source="telegram-test", + username="tester", + ) + + with patch( + "app.agent.memory.memory_manager.get_agent_messages", return_value=[] + ), patch.object(agent, "_execute_agent", new_callable=AsyncMock) as execute_agent: + asyncio.run( + agent.process( + "帮我推荐一部电影", + has_audio_input=True, + ) + ) + + messages = execute_agent.await_args.args[0] + payload = json.loads(messages[-1].content[0]["text"]) + self.assertEqual(payload["message"], "帮我推荐一部电影") + self.assertEqual(payload["input"]["mode"], "voice") + self.assertTrue(payload["input"]["transcribed"]) + def test_llm_supports_image_input_respects_explicit_override(self): with patch.object(settings, "LLM_SUPPORT_IMAGE_INPUT", False): self.assertFalse(LLMHelper.supports_image_input()) @@ -447,6 +476,29 @@ class AgentImageSupportTest(unittest.TestCase): "/tmp/image_1.jpg", ) + def test_handle_ai_message_forwards_voice_input_to_agent_manager(self): + """AI消息入队时应保留语音输入标记。""" + chain = MessageChain() + + with patch.object(settings, "AI_AGENT_ENABLE", True), patch.object( + chain, "_get_or_create_session_id", return_value="session-1" + ), patch( + "app.chain.message.agent_manager.process_message", new_callable=AsyncMock + ) as process_message, patch( + "app.chain.message.asyncio.run_coroutine_threadsafe", + side_effect=lambda coro, _loop: (coro.close(), Mock())[1], + ): + chain._handle_ai_message( + text="帮我推荐一部电影", + channel=MessageChannel.Telegram, + source="telegram-test", + userid="10001", + username="tester", + has_audio_input=True, + ) + + self.assertTrue(process_message.call_args.kwargs["has_audio_input"]) + def test_slack_images_use_authenticated_data_url_download(self): chain = MessageChain() diff --git a/tests/test_agent_interaction.py b/tests/test_agent_interaction.py index 67558699..589e46c2 100644 --- a/tests/test_agent_interaction.py +++ b/tests/test_agent_interaction.py @@ -30,6 +30,8 @@ class TestAgentInteraction(unittest.TestCase): ) self.assertIn("ask_user_choice", telegram_prompt) + self.assertIn("terminal interaction tool", telegram_prompt) + self.assertIn("do not write a final text reply after it", telegram_prompt) self.assertNotIn("ask_user_choice", wechat_prompt) def test_factory_injects_choice_tool_only_for_button_channels(self): @@ -60,6 +62,7 @@ class TestAgentInteraction(unittest.TestCase): tool = AskUserChoiceTool(session_id="session-1", user_id="10001") self.assertTrue(tool.return_direct) + self.assertIn("terminal interaction tool", tool.description) def test_choice_tool_sends_buttons_and_registers_pending_request(self): tool = AskUserChoiceTool(session_id="session-1", user_id="10001") diff --git a/tests/test_agent_prompt_style.py b/tests/test_agent_prompt_style.py index f023575b..3f4f2a06 100644 --- a/tests/test_agent_prompt_style.py +++ b/tests/test_agent_prompt_style.py @@ -244,6 +244,24 @@ class TestAgentPromptStyle(unittest.TestCase): prompt, ) + def test_voice_prompt_marks_voice_tool_as_terminal_reply(self): + """语音回复提示词应说明语音工具会结束当前轮次。""" + with patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True): + prompt = prompt_manager.get_agent_prompt() + + self.assertIn("send_voice_message", prompt) + self.assertIn("terminal response tool", prompt) + self.assertIn("do not write a final text reply after it", prompt) + self.assertIn("text fallback and still completes the reply", prompt) + + def test_core_prompt_describes_voice_input_metadata(self): + """核心提示词应说明结构化消息中的语音输入元信息。""" + prompt = prompt_manager.get_agent_prompt() + + self.assertIn("input.mode", prompt) + self.assertIn("voice", prompt) + self.assertIn("`message` contains its transcript", prompt) + def test_verbose_prompt_does_not_inject_silence_until_tools_finish_rule(self): with patch.object(settings, "AI_AGENT_VERBOSE", True): prompt = prompt_manager.get_agent_prompt() diff --git a/tests/test_agent_tool_streaming.py b/tests/test_agent_tool_streaming.py index db57df2f..84c1a0a9 100644 --- a/tests/test_agent_tool_streaming.py +++ b/tests/test_agent_tool_streaming.py @@ -441,7 +441,9 @@ class TestAgentToolStreaming(unittest.TestCase): self.assertEqual(notification.channel, channel) self.assertEqual(notification.voice_path, "/tmp/reply.opus") self.assertEqual(notification.voice_caption, "你好") - self.assertTrue(SendVoiceMessageTool.return_direct) + voice_tool = SendVoiceMessageTool(session_id="session-1", user_id="10001") + self.assertTrue(voice_tool.return_direct) + self.assertIn("terminal response tool", voice_tool.description) def test_send_voice_message_falls_back_for_unsupported_channels(self): """校验不支持语音输出的渠道继续回退为文字消息。"""