feat(client): 优化连接池配置以支持高并发embedding请求

refactor(request): 移除全局锁，改用信号量控制并发度
2025-11-09 20:16:47 +08:00
parent 58b746f217
commit ee44b02f93
2 changed files with 44 additions and 27 deletions
--- a/src/llm_models/model_client/openai_client.py
+++ b/src/llm_models/model_client/openai_client.py
@@ -433,11 +433,22 @@ class OpenaiClient(BaseClient):
            f"创建新的 AsyncOpenAI 客户端实例 (base_url={self.api_provider.base_url}, config_hash={self._config_hash}, loop_id={current_loop_id})"
        )

+        # 🔧 优化：增加连接池限制，支持高并发embedding请求
+        # 默认httpx限制为100，对于高频embedding场景不够用
+        import httpx
+        
+        limits = httpx.Limits(
+            max_keepalive_connections=200,  # 保持活跃连接数（原100）
+            max_connections=300,  # 最大总连接数（原100）
+            keepalive_expiry=30.0,  # 连接保活时间
+        )
+
        client = AsyncOpenAI(
            base_url=self.api_provider.base_url,
            api_key=self.api_provider.get_api_key(),
            max_retries=0,
            timeout=self.api_provider.timeout,
+            http_client=httpx.AsyncClient(limits=limits),  # 🔧 自定义连接池配置
        )

        # 存入全局缓存（带事件循环ID）
--- a/src/llm_models/utils_model.py
+++ b/src/llm_models/utils_model.py
@@ -802,7 +802,11 @@ class LLMRequest:
            for model in self.model_for_task.model_list
        }
        """模型使用量记录"""
-        self._lock = asyncio.Lock()
+        # 🔧 优化：移除全局锁，改用信号量控制并发度（允许多个请求并行）
+        # 默认允许50个并发请求，可通过配置调整
+        max_concurrent = getattr(model_set, "max_concurrent_requests", 50)
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._stats_lock = asyncio.Lock()  # 只保护统计数据的写入

        # 初始化辅助类
        self._model_selector = _ModelSelector(self.model_for_task.model_list, self.model_usage)
@@ -931,7 +935,6 @@ class LLMRequest:
        tools: list[dict[str, Any]] | None = None,
        raise_when_empty: bool = True,
    ) -> tuple[str, tuple[str, str, list[ToolCall] | None]]:
-        async with self._lock:
        """
        执行单次文本生成请求的内部方法。
        这是 `generate_response_async` 的核心实现，处理单个请求的完整生命周期，
@@ -948,6 +951,8 @@ class LLMRequest:
            Tuple[str, Tuple[str, str, Optional[List[ToolCall]]]]:
                (响应内容, (推理过程, 模型名称, 工具调用))
        """
+        # 🔧 优化：使用信号量控制并发，允许多个请求并行执行
+        async with self._semaphore:
            start_time = time.time()
            tool_options = await self._build_tool_options(tools)

@@ -1006,7 +1011,8 @@ class LLMRequest:
            endpoint (str): 请求的API端点 (e.g., "/chat/completions")。
        """
        if usage:
-            # 步骤1: 更新内存中的统计数据，用于负载均衡
+            # 步骤1: 更新内存中的统计数据，用于负载均衡（需要加锁保护）
+            async with self._stats_lock:
                stats = self.model_usage[model_info.name]

                # 计算新的平均延迟
@@ -1019,7 +1025,7 @@ class LLMRequest:
                    request_count=new_request_count,
                )

-            # 步骤2: 创建一个后台任务，将用量数据异步写入数据库
+            # 步骤2: 创建一个后台任务，将用量数据异步写入数据库（无需等待）
            asyncio.create_task(  # noqa: RUF006
                llm_usage_recorder.record_usage_to_database(
                    model_info=model_info,