From 25cb8d41bb95cdfebec58fd5334f0d75b3703c23 Mon Sep 17 00:00:00 2001
From: Windpicker-owo <3431391539@qq.com>
Date: Fri, 1 Aug 2025 03:32:00 +0800
Subject: [PATCH] Fix the voice recognition feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/chat/utils/utils_voice.py                |  7 +---
 src/llm_models/model_client/base_client.py   | 14 ++++++++
 src/llm_models/model_client/openai_client.py | 34 ++++++++++++++++++
 src/llm_models/payload_content/message.py    | 17 ++++++++-
 src/llm_models/utils_model.py                | 37 ++++++++++++++++++--
 5 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py
index baff40916..7093c1348 100644
--- a/src/chat/utils/utils_voice.py
+++ b/src/chat/utils/utils_voice.py
@@ -15,13 +15,8 @@ async def get_voice_text(voice_base64: str) -> str:
         logger.warning("语音识别未启用,无法处理语音消息")
         return "[语音]"
     try:
-        # Decode the base64 audio data
-        # Make sure the base64 string contains only ASCII characters
-        if isinstance(voice_base64, str):
-            voice_base64 = voice_base64.encode("ascii", errors="ignore").decode("ascii")
-        voice_bytes = base64.b64decode(voice_base64)
         _llm = LLMRequest(model_set=model_config.model_task_config.voice, request_type="voice")
-        text = await _llm.generate_response_for_voice(voice_bytes)
+        text = await _llm.generate_response_for_voice(voice_base64)
         if text is None:
             logger.warning("未能生成语音文本")
             return "[语音(文本生成失败)]"
diff --git a/src/llm_models/model_client/base_client.py b/src/llm_models/model_client/base_client.py
index 0ca092447..1bc653699 100644
--- a/src/llm_models/model_client/base_client.py
+++ b/src/llm_models/model_client/base_client.py
@@ -113,6 +113,20 @@ class BaseClient:
         :return: embedding response
         """
         raise RuntimeError("This method should be overridden in subclasses")
+
+    async def get_audio_transcriptions(
+        self,
+        model_info: ModelInfo,
+        message_list: list[Message],
+        extra_params: dict[str, Any] | None = None,
+    ) -> APIResponse:
+        """
+        Get an audio transcription
+        :param model_info: model information
+        :param message_list: message list containing the audio content
+        :return: audio transcription response
+        """
+        raise RuntimeError("This method should be overridden in subclasses")
 
 
 class ClientRegistry:
diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py
index c8483eba9..a8ba145e2 100644
--- a/src/llm_models/model_client/openai_client.py
+++ b/src/llm_models/model_client/openai_client.py
@@ -532,3 +532,37 @@ class OpenaiClient(BaseClient):
         )
 
         return response
+
+    async def get_audio_transcriptions(
+        self,
+        model_info: ModelInfo,
+        message_list: list[Message],
+        extra_params: dict[str, Any] | None = None,
+    ) -> APIResponse:
+        """
+        Get an audio transcription
+        :param model_info: model information
+        :param message_list: message list containing the audio content
+        :return: transcription response
+        """
+        try:
+            raw_response = await self.client.audio.transcriptions.create(
+                model=model_info.model_identifier,
+                file=message_list[0].content[0],
+                extra_body=extra_params
+            )
+        except APIConnectionError as e:
+            raise NetworkConnectionError() from e
+        except APIStatusError as e:
+            # Re-wrap the APIError as a RespNotOkException
+            raise RespNotOkException(e.status_code) from e
+        response = APIResponse()
+        # Parse the transcription response
+        if hasattr(raw_response, "text"):
+            response.content = raw_response.text
+        else:
+            raise RespParseException(
+                raw_response,
+                "响应解析失败,缺失转录文本。",
+            )
+        return response
\ No newline at end of file
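Note (not part of the patch): a minimal sketch of why get_audio_transcriptions can pass message_list[0].content[0] straight through. The OpenAI SDK's audio.transcriptions.create accepts a (filename, file-like) pair as its file argument, which is exactly the tuple that MessageBuilder.add_file_content (see the message.py diff below) stores in the message content. The standalone AsyncOpenAI client and the "whisper-1" model name are illustrative assumptions, not values taken from this repository.

import base64
from io import BytesIO

from openai import AsyncOpenAI


async def transcribe_sketch(voice_base64: str) -> str:
    # Decode the base64 payload into an in-memory file object,
    # mirroring what MessageBuilder.add_file_content produces.
    audio_file = ("audio.wav", BytesIO(base64.b64decode(voice_base64)))
    client = AsyncOpenAI()  # reads OPENAI_API_KEY / OPENAI_BASE_URL from the environment
    # "whisper-1" is a placeholder model identifier for illustration only.
    result = await client.audio.transcriptions.create(model="whisper-1", file=audio_file)
    return result.text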
diff --git a/src/llm_models/payload_content/message.py b/src/llm_models/payload_content/message.py
index 26202ca11..d6a960a3f 100644
--- a/src/llm_models/payload_content/message.py
+++ b/src/llm_models/payload_content/message.py
@@ -1,5 +1,6 @@
+import base64
 from enum import Enum
-
+from io import BytesIO
 
 
 # This series of classes is designed to leave room for possible future extensions
@@ -54,6 +55,20 @@
         self.__content.append(text)
         return self
 
+    def add_file_content(
+        self, file_name: str, file_base64: str
+    ) -> "MessageBuilder":
+        """
+        Add file content
+        :param file_name: file name (including the type suffix)
+        :param file_base64: base64-encoded file content
+        :return: the MessageBuilder object
+        """
+        if not file_name or not file_base64:
+            raise ValueError("文件名和base64编码不能为空")
+        self.__content.append((file_name, BytesIO(base64.b64decode(file_base64))))
+        return self
+
     def add_image_content(
         self, image_format: str, image_base64: str
     ) -> "MessageBuilder":
diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py
index bc813a58a..8e9bafeb4 100644
--- a/src/llm_models/utils_model.py
+++ b/src/llm_models/utils_model.py
@@ -38,7 +38,7 @@ class RequestType(Enum):
     RESPONSE = "response"
     EMBEDDING = "embedding"
-
+    AUDIO = "audio"
 
 
 class LLMRequest:
     """LLM request class"""
@@ -106,8 +106,32 @@ class LLMRequest:
         )
         return content, (reasoning_content, model_info.name, tool_calls)
 
-    async def generate_response_for_voice(self):
-        pass
+    async def generate_response_for_voice(self, voice_base64: str) -> Optional[str]:
+        """
+        Generate a response for a voice message
+        Args:
+            voice_base64 (str): base64-encoded voice data
+        Returns:
+            (Optional[str]): the generated text, or None
+        """
+        # Build the request payload
+        message_builder = MessageBuilder()
+        message_builder.add_file_content(file_name="audio.wav", file_base64=voice_base64)
+        messages = [message_builder.build()]
+
+        # Select the model
+        model_info, api_provider, client = self._select_model()
+
+        # Send the request and handle the result
+        response = await self._execute_request(
+            api_provider=api_provider,
+            client=client,
+            request_type=RequestType.AUDIO,
+            model_info=model_info,
+            message_list=messages,
+        )
+        return response.content or None
+
 
     async def generate_response_async(
         self,
@@ -255,6 +279,13 @@
                     embedding_input=embedding_input,
                     extra_params=model_info.extra_params,
                 )
+            elif request_type == RequestType.AUDIO:
+                assert message_list is not None, "message_list cannot be None for audio requests"
+                return await client.get_audio_transcriptions(
+                    model_info=model_info,
+                    message_list=message_list,
+                    extra_params=model_info.extra_params,
+                )
         except Exception as e:
             logger.debug(f"请求失败: {str(e)}")
             # Handle the exception
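Note (not part of the patch): an end-to-end usage sketch of the repaired path, under the assumption that the project's model_config is already loaded and that the module is importable under the file path shown in the diff; "sample.wav" is a placeholder. get_voice_text now forwards the base64 string untouched, add_file_content decodes it into a (file name, BytesIO) pair, and the AUDIO request type routes it to get_audio_transcriptions.

import asyncio
import base64

from src.chat.utils.utils_voice import get_voice_text


async def main() -> None:
    # Read any local audio file and hand its base64 form to the voice pipeline;
    # decoding now happens inside MessageBuilder.add_file_content.
    with open("sample.wav", "rb") as f:
        voice_base64 = base64.b64encode(f.read()).decode("utf-8")
    print(await get_voice_text(voice_base64))


asyncio.run(main())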