Fix event-loop issues in the embedding generation path with the following changes:

- In EmbeddingStore._get_embedding, switch to a synchronous create-use-destroy pattern that builds a fresh event loop per call, avoiding nested-event-loop errors entirely (see the sketch after this list)
- Adjust the batched path _get_embeddings_batch_threaded so that every worker thread uses its own independent, short-lived event loop
- Add a force_new parameter so that LLMRequest creates a brand-new client instance for embedding tasks, reducing reuse of objects across loops
- Add detailed logging around the OpenAI client's embedding call to make network connection failures easier to diagnose
- Rebuild LLMRequest on every get_embedding() call, lowering the chance of one instance hopping between event loops

Although this change forces the async interface into a synchronous style, it breaks none of the existing interfaces and keeps the vector database and the related knowledge-retrieval features stable. (It also moves the scripts folder back to its original location.)
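A minimal sketch of the two loop-handling patterns described above. Only the names EmbeddingStore, _get_embedding, and _get_embeddings_batch_threaded come from this change description; the method bodies, the _get_embedding_async placeholder, and the workers parameter are illustrative assumptions, not the actual implementation (the force_new/LLMRequest plumbing is omitted here):

import asyncio
from concurrent.futures import ThreadPoolExecutor


class EmbeddingStore:
    async def _get_embedding_async(self, text: str) -> list[float]:
        # Placeholder for the real async client call (assumption)
        await asyncio.sleep(0)
        return [float(len(text))]

    def _get_embedding(self, text: str) -> list[float]:
        # Create-use-destroy: every call gets a private event loop, so no
        # loop object is ever shared between threads or reused across calls
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(self._get_embedding_async(text))
        finally:
            asyncio.set_event_loop(None)
            loop.close()

    def _get_embeddings_batch_threaded(self, texts: list[str], workers: int = 4) -> list[list[float]]:
        # Each worker thread goes through _get_embedding and therefore spins
        # up its own short-lived loop, keeping loops strictly thread-local
        with ThreadPoolExecutor(max_workers=workers) as pool:
            return list(pool.map(self._get_embedding, texts))

Closing the loop in the finally block is what makes the pattern safe: even if the embedding call raises, no half-used loop survives to be picked up by a later call on a different thread.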
75 lines · 2.3 KiB · Python
import os
import sys
from pathlib import Path

# Put the project root on sys.path before importing any src.* modules
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.chat.knowledge.utils.hash import get_sha256
from src.common.logger import get_logger

logger = get_logger("lpmm")

ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
RAW_DATA_PATH = os.path.join(ROOT_PATH, "data/lpmm_raw_data")
# IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data/imported_lpmm_data")


def _process_text_file(file_path):
    """Process a single text file and return its list of paragraphs."""
    with open(file_path, "r", encoding="utf-8") as f:
        raw = f.read()

    # Blank lines delimit paragraphs; consecutive non-blank lines are joined
    paragraphs = []
    paragraph = ""
    for line in raw.split("\n"):
        if line.strip() == "":
            if paragraph != "":
                paragraphs.append(paragraph.strip())
                paragraph = ""
        else:
            paragraph += line + "\n"

    if paragraph != "":
        paragraphs.append(paragraph.strip())

    return paragraphs


def _process_multi_files() -> list[str]:
    raw_files = list(Path(RAW_DATA_PATH).glob("*.txt"))
    if not raw_files:
        logger.warning("Warning: no .txt files found in data/lpmm_raw_data")
        sys.exit(1)
    # Process every file
    all_paragraphs = []
    for file in raw_files:
        logger.info(f"Processing file: {file.name}")
        paragraphs = _process_text_file(file)
        all_paragraphs.extend(paragraphs)
    return all_paragraphs


def load_raw_data() -> tuple[list[str], list[str]]:
    """Load the raw data files.

    Reads every raw data file under RAW_DATA_PATH into memory and
    deduplicates the paragraphs by their SHA256 hash.

    Returns:
        - sha256_list: SHA256 hashes of the unique paragraphs
        - filtered_data: the unique paragraphs, in matching order
    """
    raw_data = _process_multi_files()
    sha256_list = []
    sha256_set = set()
    filtered_data = []
    for item in raw_data:
        if not isinstance(item, str):
            logger.warning(f"Unexpected data type: {item}")
            continue
        pg_hash = get_sha256(item)
        if pg_hash in sha256_set:
            logger.warning(f"Duplicate data: {item}")
            continue
        sha256_set.add(pg_hash)
        sha256_list.append(pg_hash)
        filtered_data.append(item)
    logger.info(f"Read {len(filtered_data)} entries")

    return sha256_list, filtered_data
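
A quick usage sketch for load_raw_data, assuming the script runs from the scripts folder with data/lpmm_raw_data populated with .txt files:

if __name__ == "__main__":
    sha256_list, paragraphs = load_raw_data()
    print(f"{len(paragraphs)} unique paragraphs, {len(sha256_list)} hashes")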