Merge pull request #1115 from Windpicker-owo/dev

增加对语音消息的支持
2025-07-18 13:12:59 +08:00
parent 13c211c785 93f150f95e
commit 6245f9de0a
5 changed files with 161 additions and 23 deletions
--- a/src/chat/message_receive/message.py
+++ b/src/chat/message_receive/message.py
@@ -9,6 +9,7 @@ from maim_message import Seg, UserInfo, BaseMessageInfo, MessageBase

 from src.common.logger import get_logger
 from src.chat.utils.utils_image import get_image_manager
+from src.chat.utils.utils_voice import get_voice_text
 from .chat_stream import ChatStream

 install(extra_lines=3)
@@ -106,6 +107,7 @@ class MessageRecv(Message):
        self.has_emoji = False
        self.is_picid = False
        self.has_picid = False
+        self.is_voice = False
        self.is_mentioned = None

        self.is_command = False
@@ -153,17 +155,27 @@ class MessageRecv(Message):
                self.has_emoji = True
                self.is_emoji = True
                self.is_picid = False
+                self.is_voice = False
                if isinstance(segment.data, str):
                    return await get_image_manager().get_emoji_description(segment.data)
                return "[发了一个表情包，网卡了加载不出来]"
+            elif segment.type == "voice":
+                self.is_picid = False
+                self.is_emoji = False
+                self.is_voice = True
+                if isinstance(segment.data, str):
+                    return await get_voice_text(segment.data)
+                return "[发了一段语音，网卡了加载不出来]"
            elif segment.type == "mention_bot":
                self.is_picid = False
                self.is_emoji = False
+                self.is_voice = False
                self.is_mentioned = float(segment.data)  # type: ignore
                return ""
            elif segment.type == "priority_info":
                self.is_picid = False
                self.is_emoji = False
+                self.is_voice = False
                if isinstance(segment.data, dict):
                    # 处理优先级信息
                    self.priority_mode = "priority"
@@ -212,10 +224,12 @@ class MessageRecvS4U(MessageRecv):
        """
        try:
            if segment.type == "text":
+                self.is_voice = False
                self.is_picid = False
                self.is_emoji = False
                return segment.data  # type: ignore
            elif segment.type == "image":
+                self.is_voice = False
                # 如果是base64图片数据
                if isinstance(segment.data, str):
                    self.has_picid = True
@@ -233,12 +247,22 @@ class MessageRecvS4U(MessageRecv):
                if isinstance(segment.data, str):
                    return await get_image_manager().get_emoji_description(segment.data)
                return "[发了一个表情包，网卡了加载不出来]"
+            elif segment.type == "voice":
+                self.has_picid = False
+                self.is_picid = False
+                self.is_emoji = False
+                self.is_voice = True
+                if isinstance(segment.data, str):
+                    return await get_voice_text(segment.data)
+                return "[发了一段语音，网卡了加载不出来]"
            elif segment.type == "mention_bot":
+                self.is_voice = False
                self.is_picid = False
                self.is_emoji = False
                self.is_mentioned = float(segment.data)  # type: ignore
                return ""
            elif segment.type == "priority_info":
+                self.is_voice = False
                self.is_picid = False
                self.is_emoji = False
                if isinstance(segment.data, dict):
@@ -253,6 +277,7 @@ class MessageRecvS4U(MessageRecv):
                    """
                return ""
            elif segment.type == "gift":
+                self.is_voice = False
                self.is_gift = True
                # 解析gift_info，格式为"名称:数量"
                name, count = segment.data.split(":", 1)  # type: ignore
@@ -343,6 +368,10 @@ class MessageProcessBase(Message):
                if isinstance(seg.data, str):
                    return await get_image_manager().get_emoji_description(seg.data)
                return "[表情，网卡了加载不出来]"
+            elif seg.type == "voice":
+                if isinstance(seg.data, str):
+                    return await get_voice_text(seg.data)
+                return "[发了一段语音，网卡了加载不出来]"
            elif seg.type == "at":
                return f"[@{seg.data}]"
            elif seg.type == "reply":
--- a/src/chat/utils/utils_voice.py
+++ b/src/chat/utils/utils_voice.py
@@ -0,0 +1,35 @@
+import base64
+
+from src.config.config import global_config
+from src.llm_models.utils_model import LLMRequest
+
+from src.common.logger import get_logger
+from rich.traceback import install
+install(extra_lines=3)
+
+logger = get_logger("chat_voice")
+
+async def get_voice_text(voice_base64: str) -> str:
+    """Transcribe base64-encoded audio into a bracketed "[语音：…]" text placeholder.
+
+    Returns a fixed "[语音…]" fallback when ASR is disabled or transcription fails.
+    """
+    if not global_config.chat.enable_asr:
+        logger.warning("语音识别未启用，无法处理语音消息")
+        return "[语音]"
+    try:
+        # Strip non-ASCII characters so b64decode only receives base64 text.
+        if isinstance(voice_base64, str):
+            voice_base64 = voice_base64.encode("ascii", errors="ignore").decode("ascii")
+        voice_bytes = base64.b64decode(voice_base64)
+        _llm = LLMRequest(model=global_config.model.voice, request_type="voice")
+        text = await _llm.generate_response_for_voice(voice_bytes)
+        if not isinstance(text, str) or not text:  # non-str/empty result: no transcript came back
+            logger.warning("未能生成语音文本")
+            return "[语音(文本生成失败)]"
+        logger.debug(f"描述是{text}")
+        return f"[语音：{text}]"
+    except Exception as e:
+        logger.error(f"语音转文字失败: {str(e)}")
+        return "[语音]"
+
--- a/src/config/official_configs.py
+++ b/src/config/official_configs.py
@@ -106,6 +106,9 @@ class ChatConfig(ConfigBase):
    focus_value: float = 1.0
    """麦麦的专注思考能力，越低越容易专注，消耗token也越多"""

+    enable_asr: bool = False
+    """是否启用语音识别"""
+
    def get_current_talk_frequency(self, chat_stream_id: Optional[str] = None) -> float:
        """
        根据当前时间和聊天流获取对应的 talk_frequency
@@ -630,6 +633,9 @@ class ModelConfig(ConfigBase):
    vlm: dict[str, Any] = field(default_factory=lambda: {})
    """视觉语言模型配置"""

+    voice: dict[str, Any] = field(default_factory=lambda: {})
+    """语音识别模型配置"""
+
    tool_use: dict[str, Any] = field(default_factory=lambda: {})
    """专注工具使用模型配置"""

--- a/src/llm_models/utils_model.py
+++ b/src/llm_models/utils_model.py
@@ -216,6 +216,8 @@ class LLMRequest:
        prompt: str = None,
        image_base64: str = None,
        image_format: str = None,
+        file_bytes: bytes = None,
+        file_format: str = None,
        payload: dict = None,
        retry_policy: dict = None,
    ) -> Dict[str, Any]:
@@ -225,6 +227,8 @@ class LLMRequest:
            prompt: prompt文本
            image_base64: 图片的base64编码
            image_format: 图片格式
+            file_bytes: 文件的二进制数据
+            file_format: 文件格式
            payload: 请求体数据
            retry_policy: 自定义重试策略
            request_type: 请求类型
@@ -246,9 +250,12 @@ class LLMRequest:
        # 构建请求体
        if image_base64:
            payload = await self._build_payload(prompt, image_base64, image_format)
+        elif file_bytes:
+            payload = await self._build_formdata_payload(file_bytes, file_format)
        elif payload is None:
            payload = await self._build_payload(prompt)

+        if not file_bytes:
            if stream_mode:
                payload["stream"] = stream_mode

@@ -278,6 +285,8 @@ class LLMRequest:
            "stream_mode": stream_mode,
            "image_base64": image_base64,  # 保留必要的exception处理所需的原始数据
            "image_format": image_format,
+            "file_bytes": file_bytes,
+            "file_format": file_format,
            "prompt": prompt,
        }

@@ -287,6 +296,8 @@ class LLMRequest:
        prompt: str = None,
        image_base64: str = None,
        image_format: str = None,
+        file_bytes: bytes = None,
+        file_format: str = None,
        payload: dict = None,
        retry_policy: dict = None,
        response_handler: callable = None,
@@ -299,6 +310,8 @@ class LLMRequest:
            prompt: prompt文本
            image_base64: 图片的base64编码
            image_format: 图片格式
+            file_bytes: 文件的二进制数据
+            file_format: 文件格式
            payload: 请求体数据
            retry_policy: 自定义重试策略
            response_handler: 自定义响应处理器
@@ -307,25 +320,36 @@ class LLMRequest:
        """
        # 获取请求配置
        request_content = await self._prepare_request(
-            endpoint, prompt, image_base64, image_format, payload, retry_policy
+            endpoint, prompt, image_base64, image_format, file_bytes, file_format, payload, retry_policy
        )
        if request_type is None:
            request_type = self.request_type
        for retry in range(request_content["policy"]["max_retries"]):
            try:
                # 使用上下文管理器处理会话
-                headers = await self._build_headers()
+                if file_bytes:
+                    headers = await self._build_headers(is_formdata=True)
+                else:
+                    headers = await self._build_headers(is_formdata=False)
                # 似乎是openai流式必须要的东西,不过阿里云的qwq-plus加了这个没有影响
                if request_content["stream_mode"]:
                    headers["Accept"] = "text/event-stream"
                async with aiohttp.ClientSession(connector=await get_tcp_connector()) as session:
+                    post_kwargs = {"headers": headers}
+                    #form-data数据上传方式不同
+                    if file_bytes:
+                        post_kwargs["data"] = request_content["payload"]
+                    else:
+                        post_kwargs["json"] = request_content["payload"]
+
                    async with session.post(                            
-                        request_content["api_url"], headers=headers, json=request_content["payload"]
+                        request_content["api_url"], **post_kwargs
                    ) as response:
                        handled_result = await self._handle_response(
                            response, request_content, retry, response_handler, user_id, request_type, endpoint
                        )
                        return handled_result             
+
            except Exception as e:
                handled_payload, count_delta = await self._handle_exception(e, retry, request_content)
                retry += count_delta  # 降级不计入重试次数
@@ -640,6 +664,33 @@ class LLMRequest:
                new_params["max_completion_tokens"] = new_params.pop("max_tokens")
        return new_params

+    async def _build_formdata_payload(self, file_bytes: bytes, file_format: str) -> aiohttp.FormData:
+        """Build a multipart form body carrying the file and this request's model name.
+
+        Only audio formats are mapped for now; extend ``content_types`` for other files.
+        """
+        content_types = {
+            "wav": "audio/wav",
+            "mp3": "audio/mpeg",
+            "ogg": "audio/ogg",
+            "flac": "audio/flac",
+            "aac": "audio/aac",
+        }
+        content_type = content_types.get(file_format)
+        if not content_type:
+            logger.warning(f"暂不支持的文件类型: {file_format}")
+            # Send a generic binary type rather than the literal string "None".
+            content_type = "application/octet-stream"
+        data = aiohttp.FormData()
+        data.add_field(
+            "file",
+            io.BytesIO(file_bytes),
+            filename=f"file.{file_format}",
+            content_type=content_type,
+        )
+        data.add_field("model", self.model_name)
+        return data
+    
    async def _build_payload(self, prompt: str, image_base64: str = None, image_format: str = None) -> dict:
        """构建请求体"""
        # 复制一份参数，避免直接修改 self.params
@@ -725,7 +776,8 @@ class LLMRequest:
                return content, reasoning_content, tool_calls
            else:
                return content, reasoning_content
-
+        elif "text" in result and result["text"]:
+            return result["text"]
        return "没有返回结果", ""

    @staticmethod
@@ -739,11 +791,15 @@ class LLMRequest:
            reasoning = ""
        return content, reasoning

-    async def _build_headers(self, no_key: bool = False) -> dict:
+    async def _build_headers(self, no_key: bool = False, is_formdata: bool = False) -> dict:
        """构建请求头"""
        if no_key:
+            if is_formdata:  # form-data: no Content-Type here, presumably so aiohttp sets the multipart boundary
+                return {"Authorization": "Bearer **********"}
            return {"Authorization": "Bearer **********", "Content-Type": "application/json"}
        else:
+            if is_formdata:  # same as above, but with the real API key
+                return {"Authorization": f"Bearer {self.api_key}"}
            return {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
            # 防止小朋友们截图自己的key

@@ -761,6 +817,11 @@ class LLMRequest:
            content, reasoning_content = response
            return content, reasoning_content

+    async def generate_response_for_voice(self, voice_bytes: bytes) -> Union[str, Tuple]:
+        """Send audio bytes to "/audio/transcriptions" as a wav upload and return the handled result."""
+        # A response carrying "text" presumably comes back as that str, not a tuple — hence the Union.
+        return await self._execute_request(endpoint="/audio/transcriptions", file_bytes=voice_bytes, file_format="wav")
+    
    async def generate_response_async(self, prompt: str, **kwargs) -> Union[str, Tuple]:
        """异步方式根据输入的提示生成模型的响应"""
        # 构建请求体，不硬编码max_tokens
--- a/template/bot_config_template.toml
+++ b/template/bot_config_template.toml
@@ -87,6 +87,7 @@ talk_frequency_adjust = [
 # - 时间支持跨天，例如 "00:10,0.3" 表示从凌晨0:10开始使用频率0.3
 # - 系统会自动将 "platform:id:type" 转换为内部的哈希chat_id进行匹配

+enable_asr = false # 是否启用语音识别，启用后麦麦可以通过语音输入进行对话，启用该功能需要配置语音识别模型[model.voice]

 [message_receive]
 # 以下是消息过滤，可以根据规则过滤特定消息，将不会读取这些消息
@@ -294,6 +295,12 @@ provider = "SILICONFLOW"
 pri_in = 0.35
 pri_out = 0.35

+[model.voice] # 语音识别模型
+name = "FunAudioLLM/SenseVoiceSmall"
+provider = "SILICONFLOW"
+pri_in = 0
+pri_out = 0
+
 [model.tool_use] #工具调用模型，需要使用支持工具调用的模型
 name = "Qwen/Qwen3-14B"
 provider = "SILICONFLOW"