Gemini音频转录功能，以及尝试防止空回复

2025-08-04 20:12:24 +08:00
parent 998eed4a43
commit cbe244d8f6
1 changed files with 67 additions and 3 deletions
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -16,6 +16,9 @@ from google.genai.types import (
    GenerateContentConfig,
    EmbedContentResponse,
    EmbedContentConfig,
+    SafetySetting,
+    HarmCategory,
+    HarmBlockThreshold,
 )
 from google.genai.errors import (
    ClientError,
@@ -41,6 +44,14 @@ from ..payload_content.tool_option import ToolOption, ToolParam, ToolCall

 logger = get_logger("Gemini客户端")

+gemini_safe_settings = [
+    SafetySetting(category=HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold=HarmBlockThreshold.BLOCK_NONE),
+    SafetySetting(category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold=HarmBlockThreshold.BLOCK_NONE),
+    SafetySetting(category=HarmCategory.HARM_CATEGORY_HARASSMENT, threshold=HarmBlockThreshold.BLOCK_NONE),
+    SafetySetting(category=HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold=HarmBlockThreshold.BLOCK_NONE),
+    SafetySetting(category=HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, threshold=HarmBlockThreshold.BLOCK_NONE),
+]
+

 def _convert_messages(
    messages: list[Message],
@@ -322,7 +333,7 @@ class GeminiClient(BaseClient):
        message_list: list[Message],
        tool_options: list[ToolOption] | None = None,
        max_tokens: int = 1024,
-        temperature: float = 0.7,
+        temperature: float = 0.4,
        response_format: RespFormat | None = None,
        stream_response_handler: Optional[
            Callable[
@@ -369,9 +380,12 @@ class GeminiClient(BaseClient):
            "thinking_config": ThinkingConfig(
                include_thoughts=True,
                thinking_budget=(
-                    extra_params["thinking_budget"] if extra_params and "thinking_budget" in extra_params else None
+                    extra_params["thinking_budget"]
+                    if extra_params and "thinking_budget" in extra_params
+                    else int(max_tokens / 2)  # 默认思考预算为最大token数的一半，防止空回复
                ),
            ),
+            "safety_settings": gemini_safe_settings,  # 防止空回复问题
        }
        if tools:
            generation_config_dict["tools"] = Tool(function_declarations=tools)
@@ -486,7 +500,57 @@ class GeminiClient(BaseClient):
    def get_audio_transcriptions(
        self, model_info: ModelInfo, audio_base64: str, extra_params: dict[str, Any] | None = None
    ) -> APIResponse:
-        raise NotImplementedError("尚未实现音频转录功能")
+        """
+        获取音频转录
+        :param model_info: 模型信息
+        :param audio_base64: 音频文件的Base64编码字符串
+        :param extra_params: 额外参数（可选）
+        :return: 转录响应
+        """
+        generation_config_dict = {
+            "max_output_tokens": 2048,
+            "response_modalities": ["TEXT"],
+            "thinking_config": ThinkingConfig(
+                include_thoughts=True,
+                thinking_budget=(
+                    extra_params["thinking_budget"] if extra_params and "thinking_budget" in extra_params else 1024
+                ),
+            ),
+            "safety_settings": gemini_safe_settings,
+        }
+        generate_content_config = GenerateContentConfig(**generation_config_dict)
+        prompt = "Generate a transcript of the speech. The language of the transcript should **match the language of the speech**."
+        try:
+            raw_response: GenerateContentResponse = self.client.models.generate_content(
+                model=model_info.model_identifier,
+                contents=[
+                    Content(
+                        role="user",
+                        parts=[
+                            Part.from_text(text=prompt),
+                            Part.from_bytes(data=base64.b64decode(audio_base64), mime_type="audio/wav"),
+                        ],
+                    )
+                ],
+                config=generate_content_config,
+            )
+            resp, usage_record = _default_normal_response_parser(raw_response)
+        except (ClientError, ServerError) as e:
+            # 重封装ClientError和ServerError为RespNotOkException
+            raise RespNotOkException(e.code) from None
+        except Exception as e:
+            raise NetworkConnectionError() from e
+
+        if usage_record:
+            resp.usage = UsageRecord(
+                model_name=model_info.name,
+                provider_name=model_info.api_provider,
+                prompt_tokens=usage_record[0],
+                completion_tokens=usage_record[1],
+                total_tokens=usage_record[2],
+            )
+
+        return resp

    def get_support_image_formats(self) -> list[str]:
        """