From 587aca4d1846e6e0488e75dce2ce91717efea9b2 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 17 Jul 2025 14:50:19 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9voice=E7=B1=BB?= =?UTF-8?q?=E5=9E=8B=E6=B6=88=E6=81=AF=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/message_receive/message.py | 22 ++++++ src/chat/utils/utils_voice.py | 46 ++++++++++++ src/config/official_configs.py | 3 + src/llm_models/utils_model.py | 107 +++++++++++++++++++++------- template/bot_config_template.toml | 6 ++ 5 files changed, 157 insertions(+), 27 deletions(-) create mode 100644 src/chat/utils/utils_voice.py diff --git a/src/chat/message_receive/message.py b/src/chat/message_receive/message.py index b179a3098..a0241fe01 100644 --- a/src/chat/message_receive/message.py +++ b/src/chat/message_receive/message.py @@ -9,6 +9,7 @@ from maim_message import Seg, UserInfo, BaseMessageInfo, MessageBase from src.common.logger import get_logger from src.chat.utils.utils_image import get_image_manager +from src.chat.utils.utils_voice import get_voice_text from .chat_stream import ChatStream install(extra_lines=3) @@ -106,6 +107,7 @@ class MessageRecv(Message): self.has_emoji = False self.is_picid = False self.has_picid = False + self.is_voice = False self.is_mentioned = None self.is_command = False @@ -156,6 +158,14 @@ class MessageRecv(Message): if isinstance(segment.data, str): return await get_image_manager().get_emoji_description(segment.data) return "[发了一个表情包,网卡了加载不出来]" + elif segment.type == "voice": + self.has_picid = False + self.is_picid = False + self.is_emoji = False + self.is_voice == True + if isinstance(segment.data, str): + return await get_voice_text(segment.data) + return "[发了一段语音,网卡了加载不出来]" elif segment.type == "mention_bot": self.is_picid = False self.is_emoji = False @@ -233,6 +243,14 @@ class MessageRecvS4U(MessageRecv): if isinstance(segment.data, str): return await get_image_manager().get_emoji_description(segment.data) return "[发了一个表情包,网卡了加载不出来]" + elif segment.type == "voice": + self.has_picid = False + self.is_picid = False + self.is_emoji = False + self.is_voice == True + if isinstance(segment.data, str): + return await get_voice_text(segment.data) + return "[发了一段语音,网卡了加载不出来]" elif segment.type == "mention_bot": self.is_picid = False self.is_emoji = False @@ -343,6 +361,10 @@ class MessageProcessBase(Message): if isinstance(seg.data, str): return await get_image_manager().get_emoji_description(seg.data) return "[表情,网卡了加载不出来]" + elif seg.type == "voice": + if isinstance(seg.data, str): + return await get_voice_text(seg.data) + return "[发了一段语音,网卡了加载不出来]" elif seg.type == "at": return f"[@{seg.data}]" elif seg.type == "reply": diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py new file mode 100644 index 000000000..9dbf9933b --- /dev/null +++ b/src/chat/utils/utils_voice.py @@ -0,0 +1,46 @@ +import base64 +import os +import time +import hashlib +import uuid +from typing import Optional, Tuple +from PIL import Image +import io +import numpy as np +import asyncio + + +from src.common.database.database import db +from src.common.database.database_model import Images, ImageDescriptions +from src.config.config import global_config +from src.llm_models.utils_model import LLMRequest + +from src.common.logger import get_logger +from rich.traceback import install +import traceback +install(extra_lines=3) + +logger = get_logger("chat_voice") + +async def get_voice_text(voice_base64: str) -> str: + """获取音频文件描述""" + try: + # 计算图片哈希 + # 确保base64字符串只包含ASCII字符 + if isinstance(voice_base64, str): + voice_base64 = voice_base64.encode("ascii", errors="ignore").decode("ascii") + voice_bytes = base64.b64decode(voice_base64) + _llm = LLMRequest(model=global_config.model.voice, request_type="voice") + text = await _llm.generate_response_for_voice(voice_bytes) + if text is None: + logger.warning("未能生成语音文本") + return "[语音(文本生成失败)]" + + logger.debug(f"描述是{text}") + + return f"[语音:{text}]" + except Exception as e: + traceback.print_exc() + logger.error(f"语音转文字失败: {str(e)}") + return "[语音]" + diff --git a/src/config/official_configs.py b/src/config/official_configs.py index 67b314f7f..c3ce1aba2 100644 --- a/src/config/official_configs.py +++ b/src/config/official_configs.py @@ -630,6 +630,9 @@ class ModelConfig(ConfigBase): vlm: dict[str, Any] = field(default_factory=lambda: {}) """视觉语言模型配置""" + voice: dict[str, Any] = field(default_factory=lambda: {}) + """视觉语言模型配置""" + tool_use: dict[str, Any] = field(default_factory=lambda: {}) """专注工具使用模型配置""" diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index 1077cfa09..a81fc09d6 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -216,6 +216,8 @@ class LLMRequest: prompt: str = None, image_base64: str = None, image_format: str = None, + file_bytes: str = None, + file_format: str = None, payload: dict = None, retry_policy: dict = None, ) -> Dict[str, Any]: @@ -225,6 +227,8 @@ class LLMRequest: prompt: prompt文本 image_base64: 图片的base64编码 image_format: 图片格式 + file_bytes: 文件的二进制数据 + file_format: 文件格式 payload: 请求体数据 retry_policy: 自定义重试策略 request_type: 请求类型 @@ -246,30 +250,33 @@ class LLMRequest: # 构建请求体 if image_base64: payload = await self._build_payload(prompt, image_base64, image_format) + elif file_bytes: + payload = await self._build_formdata_payload(file_bytes, file_format) elif payload is None: payload = await self._build_payload(prompt) - if stream_mode: - payload["stream"] = stream_mode + if not file_bytes: + if stream_mode: + payload["stream"] = stream_mode - if self.temp != 0.7: - payload["temperature"] = self.temp + if self.temp != 0.7: + payload["temperature"] = self.temp - # 添加enable_thinking参数(如果不是默认值False) - if not self.enable_thinking: - payload["enable_thinking"] = False + # 添加enable_thinking参数(如果不是默认值False) + if not self.enable_thinking: + payload["enable_thinking"] = False - if self.thinking_budget != 4096: - payload["thinking_budget"] = self.thinking_budget + if self.thinking_budget != 4096: + payload["thinking_budget"] = self.thinking_budget - if self.max_tokens: - payload["max_tokens"] = self.max_tokens + if self.max_tokens: + payload["max_tokens"] = self.max_tokens - # if "max_tokens" not in payload and "max_completion_tokens" not in payload: - # payload["max_tokens"] = global_config.model.model_max_output_length - # 如果 payload 中依然存在 max_tokens 且需要转换,在这里进行再次检查 - if self.model_name.lower() in self.MODELS_NEEDING_TRANSFORMATION and "max_tokens" in payload: - payload["max_completion_tokens"] = payload.pop("max_tokens") + # if "max_tokens" not in payload and "max_completion_tokens" not in payload: + # payload["max_tokens"] = global_config.model.model_max_output_length + # 如果 payload 中依然存在 max_tokens 且需要转换,在这里进行再次检查 + if self.model_name.lower() in self.MODELS_NEEDING_TRANSFORMATION and "max_tokens" in payload: + payload["max_completion_tokens"] = payload.pop("max_tokens") return { "policy": policy, @@ -278,6 +285,8 @@ class LLMRequest: "stream_mode": stream_mode, "image_base64": image_base64, # 保留必要的exception处理所需的原始数据 "image_format": image_format, + "file_bytes": file_bytes, + "file_format": file_format, "prompt": prompt, } @@ -287,6 +296,8 @@ class LLMRequest: prompt: str = None, image_base64: str = None, image_format: str = None, + file_bytes: str = None, + file_format: str = None, payload: dict = None, retry_policy: dict = None, response_handler: callable = None, @@ -299,6 +310,8 @@ class LLMRequest: prompt: prompt文本 image_base64: 图片的base64编码 image_format: 图片格式 + file_base64: 文件的二进制数据 + file_format: 文件格式 payload: 请求体数据 retry_policy: 自定义重试策略 response_handler: 自定义响应处理器 @@ -307,25 +320,38 @@ class LLMRequest: """ # 获取请求配置 request_content = await self._prepare_request( - endpoint, prompt, image_base64, image_format, payload, retry_policy + endpoint, prompt, image_base64, image_format, file_bytes, file_format, payload, retry_policy ) if request_type is None: request_type = self.request_type for retry in range(request_content["policy"]["max_retries"]): try: # 使用上下文管理器处理会话 - headers = await self._build_headers() + if file_bytes: + headers = await self._build_headers(is_formdata=True) + else: + headers = await self._build_headers(is_formdata=False) # 似乎是openai流式必须要的东西,不过阿里云的qwq-plus加了这个没有影响 if request_content["stream_mode"]: headers["Accept"] = "text/event-stream" async with aiohttp.ClientSession(connector=await get_tcp_connector()) as session: - async with session.post( - request_content["api_url"], headers=headers, json=request_content["payload"] - ) as response: - handled_result = await self._handle_response( - response, request_content, retry, response_handler, user_id, request_type, endpoint - ) - return handled_result + if file_bytes: + #form-data数据上传方式不同 + async with session.post( + request_content["api_url"], headers=headers, data=request_content["payload"] + ) as response: + handled_result = await self._handle_response( + response, request_content, retry, response_handler, user_id, request_type, endpoint + ) + return handled_result + else: + async with session.post( + request_content["api_url"], headers=headers, json=request_content["payload"] + ) as response: + handled_result = await self._handle_response( + response, request_content, retry, response_handler, user_id, request_type, endpoint + ) + return handled_result except Exception as e: handled_payload, count_delta = await self._handle_exception(e, retry, request_content) retry += count_delta # 降级不计入重试次数 @@ -640,6 +666,23 @@ class LLMRequest: new_params["max_completion_tokens"] = new_params.pop("max_tokens") return new_params + async def _build_formdata_payload(self, file_bytes: str, file_format: str): + """构建form-data请求体""" + # 非常丑陋的方法,先将文件写入本地,然后再读取,应该有更好的办法 + with open(f"file.{file_format}","wb") as f: + f.write(file_bytes) + + data = aiohttp.FormData() + data.add_field( + "file",open(f"file.{file_format}","rb"), + filename=f"file.{file_format}", + content_type='audio/wav' + ) + data.add_field( + "model", self.model_name + ) + return data + async def _build_payload(self, prompt: str, image_base64: str = None, image_format: str = None) -> dict: """构建请求体""" # 复制一份参数,避免直接修改 self.params @@ -725,7 +768,8 @@ class LLMRequest: return content, reasoning_content, tool_calls else: return content, reasoning_content - + elif "text" in result and result["text"]: + return result["text"] return "没有返回结果", "" @staticmethod @@ -739,11 +783,15 @@ class LLMRequest: reasoning = "" return content, reasoning - async def _build_headers(self, no_key: bool = False) -> dict: + async def _build_headers(self, no_key: bool = False, is_formdata: bool = False) -> dict: """构建请求头""" if no_key: + if is_formdata: + return {"Authorization": "Bearer **********"} return {"Authorization": "Bearer **********", "Content-Type": "application/json"} else: + if is_formdata: + return {"Authorization": f"Bearer {self.api_key}"} return {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"} # 防止小朋友们截图自己的key @@ -761,6 +809,11 @@ class LLMRequest: content, reasoning_content = response return content, reasoning_content + async def generate_response_for_voice(self, voice_bytes: bytes) -> Tuple: + """根据输入的语音文件生成模型的异步响应""" + response = await self._execute_request(endpoint="/audio/transcriptions",file_bytes=voice_bytes, file_format='wav') + return response + async def generate_response_async(self, prompt: str, **kwargs) -> Union[str, Tuple]: """异步方式根据输入的提示生成模型的响应""" # 构建请求体,不硬编码max_tokens diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml index fbb816621..87110f329 100644 --- a/template/bot_config_template.toml +++ b/template/bot_config_template.toml @@ -294,6 +294,12 @@ provider = "SILICONFLOW" pri_in = 0.35 pri_out = 0.35 +[model.voice] # 语音识别模型 +name = "FunAudioLLM/SenseVoiceSmall" +provider = "SILICONFLOW" +pri_in = 0 +pri_out = 0 + [model.tool_use] #工具调用模型,需要使用支持工具调用的模型 name = "Qwen/Qwen3-14B" provider = "SILICONFLOW" From 835ea2435191606a2e5f30c04039fe469d6489b0 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 17 Jul 2025 15:01:12 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86config=E6=B3=A8?= =?UTF-8?q?=E9=87=8A=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/config/official_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/official_configs.py b/src/config/official_configs.py index c3ce1aba2..68d9468e1 100644 --- a/src/config/official_configs.py +++ b/src/config/official_configs.py @@ -631,7 +631,7 @@ class ModelConfig(ConfigBase): """视觉语言模型配置""" voice: dict[str, Any] = field(default_factory=lambda: {}) - """视觉语言模型配置""" + """语音识别模型配置""" tool_use: dict[str, Any] = field(default_factory=lambda: {}) """专注工具使用模型配置""" From 367be4e7d7bfb5de808ae90d8b6fc3e4ab98bdc9 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 17 Jul 2025 15:12:20 +0800 Subject: [PATCH 3/7] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BA=86=E9=83=A8?= =?UTF-8?q?=E5=88=86=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/message_receive/message.py | 4 ++-- src/chat/utils/utils_voice.py | 4 +--- src/llm_models/utils_model.py | 18 ++++++++++++------ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/chat/message_receive/message.py b/src/chat/message_receive/message.py index a0241fe01..11b8c86c3 100644 --- a/src/chat/message_receive/message.py +++ b/src/chat/message_receive/message.py @@ -162,7 +162,7 @@ class MessageRecv(Message): self.has_picid = False self.is_picid = False self.is_emoji = False - self.is_voice == True + self.is_voice = True if isinstance(segment.data, str): return await get_voice_text(segment.data) return "[发了一段语音,网卡了加载不出来]" @@ -247,7 +247,7 @@ class MessageRecvS4U(MessageRecv): self.has_picid = False self.is_picid = False self.is_emoji = False - self.is_voice == True + self.is_voice = True if isinstance(segment.data, str): return await get_voice_text(segment.data) return "[发了一段语音,网卡了加载不出来]" diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py index 9dbf9933b..960ea0b1f 100644 --- a/src/chat/utils/utils_voice.py +++ b/src/chat/utils/utils_voice.py @@ -17,7 +17,6 @@ from src.llm_models.utils_model import LLMRequest from src.common.logger import get_logger from rich.traceback import install -import traceback install(extra_lines=3) logger = get_logger("chat_voice") @@ -25,7 +24,7 @@ logger = get_logger("chat_voice") async def get_voice_text(voice_base64: str) -> str: """获取音频文件描述""" try: - # 计算图片哈希 + # 解码base64音频数据 # 确保base64字符串只包含ASCII字符 if isinstance(voice_base64, str): voice_base64 = voice_base64.encode("ascii", errors="ignore").decode("ascii") @@ -40,7 +39,6 @@ async def get_voice_text(voice_base64: str) -> str: return f"[语音:{text}]" except Exception as e: - traceback.print_exc() logger.error(f"语音转文字失败: {str(e)}") return "[语音]" diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index a81fc09d6..9d834afe9 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -668,15 +668,21 @@ class LLMRequest: async def _build_formdata_payload(self, file_bytes: str, file_format: str): """构建form-data请求体""" - # 非常丑陋的方法,先将文件写入本地,然后再读取,应该有更好的办法 - with open(f"file.{file_format}","wb") as f: - f.write(file_bytes) - + # 目前只适配了音频文件 + # 如果后续要支持其他类型的文件,可以在这里添加更多的处理逻辑 data = aiohttp.FormData() + content_type_list = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "ogg": "audio/ogg", + "flac": "audio/flac", + "aac": "audio/aac", + } + data.add_field( - "file",open(f"file.{file_format}","rb"), + "file",io.BytesIO(file_bytes), filename=f"file.{file_format}", - content_type='audio/wav' + content_type=f'audio/{content_type_list[file_format]}' # 根据实际文件类型设置 ) data.add_field( "model", self.model_name From 830acaf35fa97557d4448e5fca3a57aef074a981 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 17 Jul 2025 15:35:13 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BA=86=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E8=A7=84=E8=8C=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/message_receive/message.py | 9 ++++++- src/chat/utils/utils_voice.py | 12 --------- src/llm_models/utils_model.py | 40 +++++++++++++++-------------- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/src/chat/message_receive/message.py b/src/chat/message_receive/message.py index 11b8c86c3..1346e73c5 100644 --- a/src/chat/message_receive/message.py +++ b/src/chat/message_receive/message.py @@ -155,11 +155,11 @@ class MessageRecv(Message): self.has_emoji = True self.is_emoji = True self.is_picid = False + self.is_voice = False if isinstance(segment.data, str): return await get_image_manager().get_emoji_description(segment.data) return "[发了一个表情包,网卡了加载不出来]" elif segment.type == "voice": - self.has_picid = False self.is_picid = False self.is_emoji = False self.is_voice = True @@ -169,11 +169,13 @@ class MessageRecv(Message): elif segment.type == "mention_bot": self.is_picid = False self.is_emoji = False + self.is_voice = False self.is_mentioned = float(segment.data) # type: ignore return "" elif segment.type == "priority_info": self.is_picid = False self.is_emoji = False + self.is_voice = False if isinstance(segment.data, dict): # 处理优先级信息 self.priority_mode = "priority" @@ -222,10 +224,12 @@ class MessageRecvS4U(MessageRecv): """ try: if segment.type == "text": + self.is_voice = False self.is_picid = False self.is_emoji = False return segment.data # type: ignore elif segment.type == "image": + self.is_voice = False # 如果是base64图片数据 if isinstance(segment.data, str): self.has_picid = True @@ -252,11 +256,13 @@ class MessageRecvS4U(MessageRecv): return await get_voice_text(segment.data) return "[发了一段语音,网卡了加载不出来]" elif segment.type == "mention_bot": + self.is_voice = False self.is_picid = False self.is_emoji = False self.is_mentioned = float(segment.data) # type: ignore return "" elif segment.type == "priority_info": + self.is_voice = False self.is_picid = False self.is_emoji = False if isinstance(segment.data, dict): @@ -271,6 +277,7 @@ class MessageRecvS4U(MessageRecv): """ return "" elif segment.type == "gift": + self.is_voice = False self.is_gift = True # 解析gift_info,格式为"名称:数量" name, count = segment.data.split(":", 1) # type: ignore diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py index 960ea0b1f..feab92cf0 100644 --- a/src/chat/utils/utils_voice.py +++ b/src/chat/utils/utils_voice.py @@ -1,17 +1,5 @@ import base64 -import os -import time -import hashlib -import uuid -from typing import Optional, Tuple -from PIL import Image -import io -import numpy as np -import asyncio - -from src.common.database.database import db -from src.common.database.database_model import Images, ImageDescriptions from src.config.config import global_config from src.llm_models.utils_model import LLMRequest diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index 9d834afe9..7270587e6 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -310,7 +310,7 @@ class LLMRequest: prompt: prompt文本 image_base64: 图片的base64编码 image_format: 图片格式 - file_base64: 文件的二进制数据 + file_bytes: 文件的二进制数据 file_format: 文件格式 payload: 请求体数据 retry_policy: 自定义重试策略 @@ -335,23 +335,21 @@ class LLMRequest: if request_content["stream_mode"]: headers["Accept"] = "text/event-stream" async with aiohttp.ClientSession(connector=await get_tcp_connector()) as session: + post_kwargs = {"headers": headers} + #form-data数据上传方式不同 if file_bytes: - #form-data数据上传方式不同 - async with session.post( - request_content["api_url"], headers=headers, data=request_content["payload"] - ) as response: - handled_result = await self._handle_response( - response, request_content, retry, response_handler, user_id, request_type, endpoint - ) - return handled_result + post_kwargs["data"] = request_content["payload"] else: - async with session.post( - request_content["api_url"], headers=headers, json=request_content["payload"] - ) as response: - handled_result = await self._handle_response( - response, request_content, retry, response_handler, user_id, request_type, endpoint - ) - return handled_result + post_kwargs["json"] = request_content["payload"] + + async with session.post( + request_content["api_url"], **post_kwargs + ) as response: + handled_result = await self._handle_response( + response, request_content, retry, response_handler, user_id, request_type, endpoint + ) + return handled_result + except Exception as e: handled_payload, count_delta = await self._handle_exception(e, retry, request_content) retry += count_delta # 降级不计入重试次数 @@ -666,7 +664,7 @@ class LLMRequest: new_params["max_completion_tokens"] = new_params.pop("max_tokens") return new_params - async def _build_formdata_payload(self, file_bytes: str, file_format: str): + async def _build_formdata_payload(self, file_bytes: str, file_format: str) -> aiohttp.FormData: """构建form-data请求体""" # 目前只适配了音频文件 # 如果后续要支持其他类型的文件,可以在这里添加更多的处理逻辑 @@ -678,11 +676,15 @@ class LLMRequest: "flac": "audio/flac", "aac": "audio/aac", } - + + content_type = content_type_list.get(file_format) + if not content_type: + logger.warning(f"暂不支持的文件类型: {file_format}") + data.add_field( "file",io.BytesIO(file_bytes), filename=f"file.{file_format}", - content_type=f'audio/{content_type_list[file_format]}' # 根据实际文件类型设置 + content_type=f'{content_type_list[file_format]}' # 根据实际文件类型设置 ) data.add_field( "model", self.model_name From 2636e9d55a82c517220c7cf69cbeafa1822460c7 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 17 Jul 2025 15:47:33 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E6=AD=A3=E7=A1=AE=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=BA=86file=5Fbytes=E7=9A=84=E7=B1=BB=E5=9E=8B=E6=A0=87?= =?UTF-8?q?=E6=B3=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/llm_models/utils_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index 7270587e6..511835c83 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -664,7 +664,7 @@ class LLMRequest: new_params["max_completion_tokens"] = new_params.pop("max_tokens") return new_params - async def _build_formdata_payload(self, file_bytes: str, file_format: str) -> aiohttp.FormData: + async def _build_formdata_payload(self, file_bytes: bytes, file_format: str) -> aiohttp.FormData: """构建form-data请求体""" # 目前只适配了音频文件 # 如果后续要支持其他类型的文件,可以在这里添加更多的处理逻辑 From 3d9f1a1d5ac69f8f4de2dc235fd73493851d5a82 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 18 Jul 2025 13:02:38 +0800 Subject: [PATCH 6/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86enable=5Fasr?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E9=80=89=E9=A1=B9=EF=BC=8C=E6=9B=B4=E6=94=B9?= =?UTF-8?q?=E4=B8=80=E5=A4=84=E6=BD=9C=E5=9C=A8=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/utils/utils_voice.py | 3 +++ src/config/official_configs.py | 3 +++ src/llm_models/utils_model.py | 2 +- template/bot_config_template.toml | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py index feab92cf0..1bc3e7dda 100644 --- a/src/chat/utils/utils_voice.py +++ b/src/chat/utils/utils_voice.py @@ -11,6 +11,9 @@ logger = get_logger("chat_voice") async def get_voice_text(voice_base64: str) -> str: """获取音频文件描述""" + if not global_config.chat.enable_asr: + logger.warning("语音识别未启用,无法处理语音消息") + return "[语音]" try: # 解码base64音频数据 # 确保base64字符串只包含ASCII字符 diff --git a/src/config/official_configs.py b/src/config/official_configs.py index 68d9468e1..be3ac1834 100644 --- a/src/config/official_configs.py +++ b/src/config/official_configs.py @@ -106,6 +106,9 @@ class ChatConfig(ConfigBase): focus_value: float = 1.0 """麦麦的专注思考能力,越低越容易专注,消耗token也越多""" + enable_asr: bool = False + """是否启用语音识别""" + def get_current_talk_frequency(self, chat_stream_id: Optional[str] = None) -> float: """ 根据当前时间和聊天流获取对应的 talk_frequency diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index 511835c83..215b0f739 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -684,7 +684,7 @@ class LLMRequest: data.add_field( "file",io.BytesIO(file_bytes), filename=f"file.{file_format}", - content_type=f'{content_type_list[file_format]}' # 根据实际文件类型设置 + content_type=f'{content_type}' # 根据实际文件类型设置 ) data.add_field( "model", self.model_name diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml index 87110f329..3b21dae38 100644 --- a/template/bot_config_template.toml +++ b/template/bot_config_template.toml @@ -87,6 +87,7 @@ talk_frequency_adjust = [ # - 时间支持跨天,例如 "00:10,0.3" 表示从凌晨0:10开始使用频率0.3 # - 系统会自动将 "platform:id:type" 转换为内部的哈希chat_id进行匹配 +enable_asr = false # 是否启用语音识别,启用后麦麦可以通过语音输入进行对话,启用该功能需要配置语音识别模型[model.voice] [message_receive] # 以下是消息过滤,可以根据规则过滤特定消息,将不会读取这些消息 From 93f150f95ee0d8e875366779c5295f14cebb8f03 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 18 Jul 2025 13:11:10 +0800 Subject: [PATCH 7/7] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=863=E5=A4=84?= =?UTF-8?q?=E7=B1=BB=E5=9E=8B=E6=A0=87=E6=B3=A8=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/llm_models/utils_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index 215b0f739..1f90a730a 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -216,7 +216,7 @@ class LLMRequest: prompt: str = None, image_base64: str = None, image_format: str = None, - file_bytes: str = None, + file_bytes: bytes = None, file_format: str = None, payload: dict = None, retry_policy: dict = None, @@ -296,7 +296,7 @@ class LLMRequest: prompt: str = None, image_base64: str = None, image_format: str = None, - file_bytes: str = None, + file_bytes: bytes = None, file_format: str = None, payload: dict = None, retry_policy: dict = None,