From 25cb8d41bb95cdfebec58fd5334f0d75b3703c23 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 1 Aug 2025 03:32:00 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86=E8=AF=AD?= =?UTF-8?q?=E9=9F=B3=E8=AF=86=E5=88=AB=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/utils/utils_voice.py | 7 +--- src/llm_models/model_client/base_client.py | 14 ++++++++ src/llm_models/model_client/openai_client.py | 34 ++++++++++++++++++ src/llm_models/payload_content/message.py | 17 ++++++++- src/llm_models/utils_model.py | 37 ++++++++++++++++++-- 5 files changed, 99 insertions(+), 10 deletions(-) diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py index baff40916..7093c1348 100644 --- a/src/chat/utils/utils_voice.py +++ b/src/chat/utils/utils_voice.py @@ -15,13 +15,8 @@ async def get_voice_text(voice_base64: str) -> str: logger.warning("语音识别未启用,无法处理语音消息") return "[语音]" try: - # 解码base64音频数据 - # 确保base64字符串只包含ASCII字符 - if isinstance(voice_base64, str): - voice_base64 = voice_base64.encode("ascii", errors="ignore").decode("ascii") - voice_bytes = base64.b64decode(voice_base64) _llm = LLMRequest(model_set=model_config.model_task_config.voice, request_type="voice") - text = await _llm.generate_response_for_voice(voice_bytes) + text = await _llm.generate_response_for_voice(voice_base64) if text is None: logger.warning("未能生成语音文本") return "[语音(文本生成失败)]" diff --git a/src/llm_models/model_client/base_client.py b/src/llm_models/model_client/base_client.py index 0ca092447..1bc653699 100644 --- a/src/llm_models/model_client/base_client.py +++ b/src/llm_models/model_client/base_client.py @@ -113,6 +113,20 @@ class BaseClient: :return: 嵌入响应 """ raise RuntimeError("This method should be overridden in subclasses") + + async def get_audio_transcriptions( + self, + model_info: ModelInfo, + message_list: list[Message], + extra_params: dict[str, Any] | None = None, + ) -> APIResponse: + """ + 获取音频转录 + :param model_info: 模型信息 + :param message_list: 消息列表,包含音频内容 + :return: 音频转录响应 + """ + raise RuntimeError("This method should be overridden in subclasses") class ClientRegistry: diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py index c8483eba9..a8ba145e2 100644 --- a/src/llm_models/model_client/openai_client.py +++ b/src/llm_models/model_client/openai_client.py @@ -532,3 +532,37 @@ class OpenaiClient(BaseClient): ) return response + + async def get_audio_transcriptions( + self, + model_info: ModelInfo, + message_list: list[Message], + extra_params: dict[str, Any] | None = None, + ) -> APIResponse: + """ + 获取音频转录 + :param model_info: 模型信息 + :param audio_base64: 音频的base64编码 + :return: 转录响应 + """ + try: + raw_response = await self.client.audio.transcriptions.create( + model=model_info.model_identifier, + file=message_list[0].content[0], + extra_body=extra_params + ) + except APIConnectionError as e: + raise NetworkConnectionError() from e + except APIStatusError as e: + # 重封装APIError为RespNotOkException + raise RespNotOkException(e.status_code) from e + response = APIResponse() + # 解析转录响应 + if hasattr(raw_response, "text"): + response.content = raw_response.text + else: + raise RespParseException( + raw_response, + "响应解析失败,缺失转录文本。", + ) + return response \ No newline at end of file diff --git a/src/llm_models/payload_content/message.py b/src/llm_models/payload_content/message.py index 26202ca11..d6a960a3f 100644 --- a/src/llm_models/payload_content/message.py +++ b/src/llm_models/payload_content/message.py @@ -1,5 +1,6 @@ +import base64 from enum import Enum - +from io import BytesIO # 设计这系列类的目的是为未来可能的扩展做准备 @@ -54,6 +55,20 @@ class MessageBuilder: self.__content.append(text) return self + def add_file_content( + self, file_name: str, file_base64: str + ) -> "MessageBuilder": + """ + 添加文件内容 + :param file_name: 文件名(包含类型后缀) + :param file_base64: 文件的base64编码 + :return: MessageBuilder对象 + """ + if not file_name or not file_base64: + raise ValueError("文件名和base64编码不能为空") + self.__content.append((file_name, BytesIO(base64.b64decode(file_base64)))) + return self + def add_image_content( self, image_format: str, image_base64: str ) -> "MessageBuilder": diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index bc813a58a..8e9bafeb4 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -38,7 +38,7 @@ class RequestType(Enum): RESPONSE = "response" EMBEDDING = "embedding" - + AUDIO = "audio" class LLMRequest: """LLM请求类""" @@ -106,8 +106,32 @@ class LLMRequest: ) return content, (reasoning_content, model_info.name, tool_calls) - async def generate_response_for_voice(self): - pass + async def generate_response_for_voice(self, voice_base64: str) -> Optional[str]: + """ + 为语音生成响应 + Args: + voice_base64 (str): 语音的Base64编码字符串 + Returns: + (Optional[str]): 生成的文本描述或None + """ + # 请求体构建 + message_builder = MessageBuilder() + message_builder.add_file_content(file_name="audio.wav", file_base64=voice_base64) + messages = [message_builder.build()] + + # 模型选择 + model_info, api_provider, client = self._select_model() + + # 请求并处理返回值 + response = await self._execute_request( + api_provider=api_provider, + client=client, + request_type=RequestType.AUDIO, + model_info=model_info, + message_list=messages, + ) + return response.content or None + async def generate_response_async( self, @@ -255,6 +279,13 @@ class LLMRequest: embedding_input=embedding_input, extra_params=model_info.extra_params, ) + elif request_type == RequestType.AUDIO: + assert message_list is not None, "message_list cannot be None for audio requests" + return await client.get_audio_transcriptions( + model_info=model_info, + message_list=message_list, + extra_params=model_info.extra_params, + ) except Exception as e: logger.debug(f"请求失败: {str(e)}") # 处理异常 From 49af7b0c6570b242fc0e07ac1d1d3c6a76f127e4 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 1 Aug 2025 03:40:24 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E7=9A=84=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/llm_models/model_client/openai_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py index a8ba145e2..1bcd54bf2 100644 --- a/src/llm_models/model_client/openai_client.py +++ b/src/llm_models/model_client/openai_client.py @@ -542,7 +542,7 @@ class OpenaiClient(BaseClient): """ 获取音频转录 :param model_info: 模型信息 - :param audio_base64: 音频的base64编码 + :param message_list: 消息列表,包含音频内容 :return: 转录响应 """ try: From 70e12122b605fdde028f1ad6019a5596b37a424a Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 1 Aug 2025 03:42:30 +0800 Subject: [PATCH 3/4] typing --- src/llm_models/payload_content/message.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llm_models/payload_content/message.py b/src/llm_models/payload_content/message.py index d6a960a3f..71ab67389 100644 --- a/src/llm_models/payload_content/message.py +++ b/src/llm_models/payload_content/message.py @@ -34,7 +34,7 @@ class Message: class MessageBuilder: def __init__(self): self.__role: RoleType = RoleType.User - self.__content: list[tuple[str, str] | str] = [] + self.__content: list[tuple[str, str] | str | tuple[str, BytesIO]] = [] self.__tool_call_id: str | None = None def set_role(self, role: RoleType = RoleType.User) -> "MessageBuilder": From fcaa78f9a5fc0d32f7b37178d84b5a49b8f974c3 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 1 Aug 2025 12:49:09 +0800 Subject: [PATCH 4/4] =?UTF-8?q?=E5=88=A0=E9=99=A4=E4=BA=86add=5Ffile=5Fcon?= =?UTF-8?q?tent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/llm_models/model_client/base_client.py | 5 +++-- src/llm_models/model_client/openai_client.py | 10 ++++++---- src/llm_models/payload_content/message.py | 19 ++----------------- src/llm_models/utils_model.py | 10 +++------- 4 files changed, 14 insertions(+), 30 deletions(-) diff --git a/src/llm_models/model_client/base_client.py b/src/llm_models/model_client/base_client.py index 1bc653699..b06f846a4 100644 --- a/src/llm_models/model_client/base_client.py +++ b/src/llm_models/model_client/base_client.py @@ -117,13 +117,14 @@ class BaseClient: async def get_audio_transcriptions( self, model_info: ModelInfo, - message_list: list[Message], + audio_base64: str, extra_params: dict[str, Any] | None = None, ) -> APIResponse: """ 获取音频转录 :param model_info: 模型信息 - :param message_list: 消息列表,包含音频内容 + :param audio_base64: base64编码的音频数据 + :extra_params: 附加的请求参数 :return: 音频转录响应 """ raise RuntimeError("This method should be overridden in subclasses") diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py index 1bcd54bf2..d7a923faf 100644 --- a/src/llm_models/model_client/openai_client.py +++ b/src/llm_models/model_client/openai_client.py @@ -2,6 +2,7 @@ import asyncio import io import json import re +import base64 from collections.abc import Iterable from typing import Callable, Any, Coroutine, Optional from json_repair import repair_json @@ -536,19 +537,20 @@ class OpenaiClient(BaseClient): async def get_audio_transcriptions( self, model_info: ModelInfo, - message_list: list[Message], + audio_base64: str, extra_params: dict[str, Any] | None = None, ) -> APIResponse: """ 获取音频转录 :param model_info: 模型信息 - :param message_list: 消息列表,包含音频内容 - :return: 转录响应 + :param audio_base64: base64编码的音频数据 + :extra_params: 附加的请求参数 + :return: 音频转录响应 """ try: raw_response = await self.client.audio.transcriptions.create( model=model_info.model_identifier, - file=message_list[0].content[0], + file=("audio.wav", io.BytesIO(base64.b64decode(audio_base64))), extra_body=extra_params ) except APIConnectionError as e: diff --git a/src/llm_models/payload_content/message.py b/src/llm_models/payload_content/message.py index 71ab67389..e07f473b8 100644 --- a/src/llm_models/payload_content/message.py +++ b/src/llm_models/payload_content/message.py @@ -1,6 +1,5 @@ -import base64 from enum import Enum -from io import BytesIO + # 设计这系列类的目的是为未来可能的扩展做准备 @@ -34,7 +33,7 @@ class Message: class MessageBuilder: def __init__(self): self.__role: RoleType = RoleType.User - self.__content: list[tuple[str, str] | str | tuple[str, BytesIO]] = [] + self.__content: list[tuple[str, str] | str] = [] self.__tool_call_id: str | None = None def set_role(self, role: RoleType = RoleType.User) -> "MessageBuilder": @@ -54,20 +53,6 @@ class MessageBuilder: """ self.__content.append(text) return self - - def add_file_content( - self, file_name: str, file_base64: str - ) -> "MessageBuilder": - """ - 添加文件内容 - :param file_name: 文件名(包含类型后缀) - :param file_base64: 文件的base64编码 - :return: MessageBuilder对象 - """ - if not file_name or not file_base64: - raise ValueError("文件名和base64编码不能为空") - self.__content.append((file_name, BytesIO(base64.b64decode(file_base64)))) - return self def add_image_content( self, image_format: str, image_base64: str diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py index 8e9bafeb4..53cc7aaae 100644 --- a/src/llm_models/utils_model.py +++ b/src/llm_models/utils_model.py @@ -114,11 +114,6 @@ class LLMRequest: Returns: (Optional[str]): 生成的文本描述或None """ - # 请求体构建 - message_builder = MessageBuilder() - message_builder.add_file_content(file_name="audio.wav", file_base64=voice_base64) - messages = [message_builder.build()] - # 模型选择 model_info, api_provider, client = self._select_model() @@ -128,7 +123,7 @@ class LLMRequest: client=client, request_type=RequestType.AUDIO, model_info=model_info, - message_list=messages, + audio_base64=voice_base64, ) return response.content or None @@ -249,6 +244,7 @@ class LLMRequest: temperature: Optional[float] = None, max_tokens: Optional[int] = None, embedding_input: str = "", + audio_base64: str = "" ) -> APIResponse: """ 实际执行请求的方法 @@ -283,7 +279,7 @@ class LLMRequest: assert message_list is not None, "message_list cannot be None for audio requests" return await client.get_audio_transcriptions( model_info=model_info, - message_list=message_list, + audio_base64=audio_base64, extra_params=model_info.extra_params, ) except Exception as e: