diff --git a/app/agent/__init__.py b/app/agent/__init__.py
index d606b9e5..f098afa0 100644
--- a/app/agent/__init__.py
+++ b/app/agent/__init__.py
@@ -861,6 +861,7 @@ class MoviePilotAgent:
             message: str,
             images: List[str] = None,
             files: Optional[List[dict]] = None,
+            has_audio_input: bool = False,
     ) -> str:
         """
         处理用户消息，流式推理并返回 Agent 回复
@@ -868,7 +869,8 @@ class MoviePilotAgent:
         try:
             logger.info(
                 f"Agent推理: session_id={self.session_id}, input={message}, "
-                f"images={len(images) if images else 0}, files={len(files) if files else 0}"
+                f"images={len(images) if images else 0}, files={len(files) if files else 0}, "
+                f"audio_input={has_audio_input}"
             )
             self._tool_context = {
                 "user_reply_sent": False,
@@ -885,6 +887,10 @@ class MoviePilotAgent:
             # 构建结构化用户消息内容
             request_payload = {
                 "message": message or "",
+                "input": {
+                    "mode": "voice" if has_audio_input else "text",
+                    "transcribed": bool(has_audio_input),
+                },
                 "images": [
                     {"index": index + 1, "type": "image"}
                     for index, _ in enumerate(images or [])
@@ -1187,6 +1193,7 @@ class _MessageTask:
     message: str
     images: Optional[List[str]] = None
     files: Optional[List[dict]] = None
+    has_audio_input: bool = False
     channel: Optional[str] = None
     source: Optional[str] = None
     username: Optional[str] = None
@@ -1333,6 +1340,7 @@ class AgentManager:
             message: str,
             images: List[str] = None,
             files: Optional[List[dict]] = None,
+            has_audio_input: bool = False,
             channel: str = None,
             source: str = None,
             username: str = None,
@@ -1352,6 +1360,7 @@ class AgentManager:
             message=message,
             images=images,
             files=files,
+            has_audio_input=has_audio_input,
             channel=channel,
             source=source,
             username=username,
@@ -1488,7 +1497,13 @@ class AgentManager:
             agent.persist_output_message = task.persist_output_message
             agent.allow_message_tools = task.allow_message_tools
 
-        return await agent.process(task.message, images=task.images, files=task.files)
+        process_kwargs = {
+            "images": task.images,
+            "files": task.files,
+        }
+        if task.has_audio_input:
+            process_kwargs["has_audio_input"] = True
+        return await agent.process(task.message, **process_kwargs)
 
     async def stop_current_task(self, session_id: str):
         """
diff --git a/app/agent/prompt/System Core Prompt.txt b/app/agent/prompt/System Core Prompt.txt
index 54062653..feb3d408 100644
--- a/app/agent/prompt/System Core Prompt.txt	
+++ b/app/agent/prompt/System Core Prompt.txt	
@@ -35,7 +35,7 @@ You act as a proactive agent. Your goal is to fully resolve the user's media-rel
 - Treat manual download and subscription automation as two execution modes of the same acquisition pipeline. Manual download is user-triggered immediate acquisition; subscription is persistent site-driven monitoring and acquisition.
 - Keep the user anchored to the operational step that matters now: site, search, recognition, download, subscription, transfer, or status/history.
 - Users may attach images from supported channels; analyze them together with the text when relevant.
-- User messages may arrive as structured JSON. Treat the `message` field as the user's text. Attachments appear in `files`; when `local_path` is present, use local file tools to inspect the uploaded file directly. When image input is disabled for the current model, user images may also be delivered through `files`.
+- User messages may arrive as structured JSON. Treat the `message` field as the user's text. Input metadata appears in `input`; when `input.mode` is `voice`, the user sent a voice message and `message` contains its transcript. Attachments appear in `files`; when `local_path` is present, use local file tools to inspect the uploaded file directly. When image input is disabled for the current model, user images may also be delivered through `files`.
 </moviepilot_domain_model>
 
 <operating_principles>
diff --git a/app/agent/prompt/__init__.py b/app/agent/prompt/__init__.py
index 054f454e..30d29c9a 100644
--- a/app/agent/prompt/__init__.py
+++ b/app/agent/prompt/__init__.py
@@ -396,7 +396,12 @@ class PromptManager:
         return (
             "Use normal text replies by default. Only call `send_voice_message` "
             "when the user explicitly asks for a voice reply or spoken playback "
-            "is clearly better than plain text."
+            "is clearly better than plain text. `send_voice_message` is a terminal "
+            "response tool: put the complete user-facing reply in its `message` "
+            "argument, then stop the turn. Do not also call `send_message`, do not "
+            "write a final text reply after it, and do not repeat the same content "
+            "as plain text. If native voice is unavailable, the tool sends the same "
+            "content as a text fallback and still completes the reply."
         )
 
     @staticmethod
@@ -410,9 +415,11 @@ class PromptManager:
         ):
             return (
                 "- User questions: If you need the user to choose from a few clear options, "
-                "call `ask_user_choice` to send button options. After the user clicks a button, "
-                "the selected value will come back as the user's next message. After calling this tool, "
-                "wait for the user's selection instead of repeating the question in plain text."
+                "call `ask_user_choice` to send button options. `ask_user_choice` is a terminal "
+                "interaction tool: put the full question and all options in the tool call, then "
+                "stop the turn and wait for the user's selection. The selected value will come back "
+                "as the user's next message. Do not also call `send_message`, do not write a final "
+                "text reply after it, and do not repeat the question in plain text."
             )
         return "- User questions: When you truly need user input, ask briefly in plain text."
 
diff --git a/app/agent/tools/impl/ask_user_choice.py b/app/agent/tools/impl/ask_user_choice.py
index ccc589c0..3048ca54 100644
--- a/app/agent/tools/impl/ask_user_choice.py
+++ b/app/agent/tools/impl/ask_user_choice.py
@@ -71,7 +71,9 @@ class AskUserChoiceTool(MoviePilotTool):
     return_direct: bool = True
     description: str = (
         "Ask the user to choose from button options on channels that support interactive buttons. "
-        "After the user clicks a button, the selected value will come back as the user's next message."
+        "This is a terminal interaction tool: put the full question and all options in this call, "
+        "then stop the current turn. After the user clicks a button, the selected value will come "
+        "back as the user's next message. Do not also send the same question as plain text."
     )
     args_schema: Type[BaseModel] = AskUserChoiceInput
     require_admin: bool = False
diff --git a/app/agent/tools/impl/send_voice_message.py b/app/agent/tools/impl/send_voice_message.py
index b4f27d15..1012fda6 100644
--- a/app/agent/tools/impl/send_voice_message.py
+++ b/app/agent/tools/impl/send_voice_message.py
@@ -35,7 +35,9 @@ class SendVoiceMessageTool(MoviePilotTool):
         "Send a voice reply to the current user. Use this only when the user explicitly asks for "
         "a voice reply or when spoken playback is clearly better than plain text. On channels "
         "without voice support or when TTS is unavailable, it automatically falls back to sending "
-        "the same content as plain text."
+        "the same content as plain text. This is a terminal response tool: put the complete "
+        "user-facing reply in `message`; after this tool runs, do not send another text reply "
+        "or call `send_message` with the same content."
     )
     args_schema: Type[BaseModel] = SendVoiceMessageInput
     require_admin: bool = False
diff --git a/app/chain/message.py b/app/chain/message.py
index d3825823..07682cf1 100644
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -350,6 +350,7 @@ class MessageChain(ChainBase):
                 original_chat_id=original_chat_id,
                 images=images,
                 files=files,
+                has_audio_input=has_audio_input,
             )
 
         if (
@@ -366,6 +367,7 @@ class MessageChain(ChainBase):
                 original_chat_id=original_chat_id,
                 images=images,
                 files=files,
+                has_audio_input=has_audio_input,
             )
 
         if MediaInteractionChain().handle_text_interaction(
@@ -1204,6 +1206,7 @@ class MessageChain(ChainBase):
             images: Optional[List[CommingMessage.MessageImage]] = None,
             files: Optional[List[CommingMessage.MessageAttachment]] = None,
             session_id: Optional[str] = None,
+            has_audio_input: bool = False,
     ) -> bool:
         """
         处理AI智能体消息
@@ -1317,6 +1320,8 @@ class MessageChain(ChainBase):
                 else None,
                 "original_chat_id": original_chat_id,
             }
+            if has_audio_input:
+                process_kwargs["has_audio_input"] = True
             # 在事件循环中处理
             asyncio.run_coroutine_threadsafe(
                 agent_manager.process_message(**process_kwargs),
diff --git a/tests/test_agent_background_output.py b/tests/test_agent_background_output.py
index 3e76aa43..adac5792 100644
--- a/tests/test_agent_background_output.py
+++ b/tests/test_agent_background_output.py
@@ -10,6 +10,7 @@ from app.agent import (
     AgentManager,
     ReplyMode,
     UNSUPPORTED_IMAGE_INPUT_MESSAGE,
+    _MessageTask,
 )
 from app.agent.memory import memory_manager
 from app.agent.tools.factory import MoviePilotToolFactory
@@ -288,6 +289,28 @@ class AgentBackgroundOutputTest(unittest.IsolatedAsyncioTestCase):
 
         process_message.assert_not_awaited()
 
+    async def test_agent_manager_preserves_voice_input_flag(self):
+        """The voice-input flag must still be passed on to the Agent when a queued session task is executed."""
+        manager = AgentManager()
+        agent = MoviePilotAgent(session_id="session-1", user_id="user-1")
+        manager.active_agents["session-1"] = agent
+        agent.process = AsyncMock(return_value="ok")  # stub process() so only its call arguments are observed
+        task = _MessageTask(
+            session_id="session-1",
+            user_id="user-1",
+            message="帮我推荐一部电影",
+            has_audio_input=True,
+        )
+
+        await manager._process_message_internal(task)
+
+        agent.process.assert_awaited_once_with(
+            "帮我推荐一部电影",
+            images=None,
+            files=None,
+            has_audio_input=True,  # the flag set on the task above
+        )
+
     async def test_create_agent_excludes_activity_log_for_heartbeat_session(self):
         agent = MoviePilotAgent(
             session_id=f"{HEARTBEAT_SESSION_PREFIX}test__",
diff --git a/tests/test_agent_image_support.py b/tests/test_agent_image_support.py
index fababa6d..849add88 100644
--- a/tests/test_agent_image_support.py
+++ b/tests/test_agent_image_support.py
@@ -242,6 +242,7 @@ class AgentImageSupportTest(unittest.TestCase):
 
         handle_ai_message.assert_called_once()
         self.assertEqual(handle_ai_message.call_args.kwargs["text"], "帮我推荐一部电影")
+        self.assertTrue(handle_ai_message.call_args.kwargs["has_audio_input"])
         self.assertNotIn("reply_with_voice", handle_ai_message.call_args.kwargs)
 
     def test_file_message_routes_to_agent_even_when_global_agent_is_disabled(self):
@@ -390,8 +391,36 @@ class AgentImageSupportTest(unittest.TestCase):
         self.assertIsInstance(content, list)
         payload = json.loads(content[0]["text"])
         self.assertEqual(payload["message"], "帮我总结这个文件")
+        self.assertEqual(payload["input"]["mode"], "text")
+        self.assertFalse(payload["input"]["transcribed"])
         self.assertEqual(payload["files"][0]["local_path"], "/tmp/report.txt")
 
+    def test_agent_process_marks_voice_input_in_structured_json(self):
+        """Voice input must be marked as a transcribed source in the structured JSON message."""
+        agent = MoviePilotAgent(
+            session_id="session-1",
+            user_id="user-1",
+            channel=MessageChannel.Telegram.value,
+            source="telegram-test",
+            username="tester",
+        )
+
+        with patch(
+            "app.agent.memory.memory_manager.get_agent_messages", return_value=[]
+        ), patch.object(agent, "_execute_agent", new_callable=AsyncMock) as execute_agent:
+            asyncio.run(
+                agent.process(
+                    "帮我推荐一部电影",
+                    has_audio_input=True,
+                )
+            )
+
+        messages = execute_agent.await_args.args[0]  # first positional argument given to the mocked _execute_agent
+        payload = json.loads(messages[-1].content[0]["text"])  # JSON text of the last message's first content part
+        self.assertEqual(payload["message"], "帮我推荐一部电影")
+        self.assertEqual(payload["input"]["mode"], "voice")
+        self.assertTrue(payload["input"]["transcribed"])
+
     def test_llm_supports_image_input_respects_explicit_override(self):
         with patch.object(settings, "LLM_SUPPORT_IMAGE_INPUT", False):
             self.assertFalse(LLMHelper.supports_image_input())
@@ -447,6 +476,29 @@ class AgentImageSupportTest(unittest.TestCase):
             "/tmp/image_1.jpg",
         )
 
+    def test_handle_ai_message_forwards_voice_input_to_agent_manager(self):
+        """The voice-input flag must be preserved when an AI message is handed to the agent manager."""
+        chain = MessageChain()
+
+        with patch.object(settings, "AI_AGENT_ENABLE", True), patch.object(
+            chain, "_get_or_create_session_id", return_value="session-1"
+        ), patch(
+            "app.chain.message.agent_manager.process_message", new_callable=AsyncMock
+        ) as process_message, patch(
+            "app.chain.message.asyncio.run_coroutine_threadsafe",
+            side_effect=lambda coro, _loop: (coro.close(), Mock())[1],  # close the coroutine, return a Mock in place of the future
+        ):
+            chain._handle_ai_message(
+                text="帮我推荐一部电影",
+                channel=MessageChannel.Telegram,
+                source="telegram-test",
+                userid="10001",
+                username="tester",
+                has_audio_input=True,
+            )
+
+        self.assertTrue(process_message.call_args.kwargs["has_audio_input"])
+
     def test_slack_images_use_authenticated_data_url_download(self):
         chain = MessageChain()
 
diff --git a/tests/test_agent_interaction.py b/tests/test_agent_interaction.py
index 67558699..589e46c2 100644
--- a/tests/test_agent_interaction.py
+++ b/tests/test_agent_interaction.py
@@ -30,6 +30,8 @@ class TestAgentInteraction(unittest.TestCase):
         )
 
         self.assertIn("ask_user_choice", telegram_prompt)
+        self.assertIn("terminal interaction tool", telegram_prompt)
+        self.assertIn("do not write a final text reply after it", telegram_prompt)
         self.assertNotIn("ask_user_choice", wechat_prompt)
 
     def test_factory_injects_choice_tool_only_for_button_channels(self):
@@ -60,6 +62,7 @@ class TestAgentInteraction(unittest.TestCase):
         tool = AskUserChoiceTool(session_id="session-1", user_id="10001")
 
         self.assertTrue(tool.return_direct)
+        self.assertIn("terminal interaction tool", tool.description)
 
     def test_choice_tool_sends_buttons_and_registers_pending_request(self):
         tool = AskUserChoiceTool(session_id="session-1", user_id="10001")
diff --git a/tests/test_agent_prompt_style.py b/tests/test_agent_prompt_style.py
index f023575b..3f4f2a06 100644
--- a/tests/test_agent_prompt_style.py
+++ b/tests/test_agent_prompt_style.py
@@ -244,6 +244,24 @@ class TestAgentPromptStyle(unittest.TestCase):
             prompt,
         )
 
+    def test_voice_prompt_marks_voice_tool_as_terminal_reply(self):
+        """The voice-reply prompt must state that the voice tool ends the current turn."""
+        with patch.object(settings, "LLM_SUPPORT_AUDIO_OUTPUT", True):  # build the prompt with audio output switched on
+            prompt = prompt_manager.get_agent_prompt()
+
+        self.assertIn("send_voice_message", prompt)
+        self.assertIn("terminal response tool", prompt)
+        self.assertIn("do not write a final text reply after it", prompt)
+        self.assertIn("text fallback and still completes the reply", prompt)
+
+    def test_core_prompt_describes_voice_input_metadata(self):
+        """The core prompt must describe the voice-input metadata carried in structured messages."""
+        prompt = prompt_manager.get_agent_prompt()
+
+        self.assertIn("input.mode", prompt)
+        self.assertIn("voice", prompt)
+        self.assertIn("`message` contains its transcript", prompt)
+
     def test_verbose_prompt_does_not_inject_silence_until_tools_finish_rule(self):
         with patch.object(settings, "AI_AGENT_VERBOSE", True):
             prompt = prompt_manager.get_agent_prompt()
diff --git a/tests/test_agent_tool_streaming.py b/tests/test_agent_tool_streaming.py
index db57df2f..84c1a0a9 100644
--- a/tests/test_agent_tool_streaming.py
+++ b/tests/test_agent_tool_streaming.py
@@ -441,7 +441,9 @@ class TestAgentToolStreaming(unittest.TestCase):
             self.assertEqual(notification.channel, channel)
             self.assertEqual(notification.voice_path, "/tmp/reply.opus")
             self.assertEqual(notification.voice_caption, "你好")
-            self.assertTrue(SendVoiceMessageTool.return_direct)
+            voice_tool = SendVoiceMessageTool(session_id="session-1", user_id="10001")
+            self.assertTrue(voice_tool.return_direct)
+            self.assertIn("terminal response tool", voice_tool.description)
 
     def test_send_voice_message_falls_back_for_unsupported_channels(self):
         """校验不支持语音输出的渠道继续回退为文字消息。"""