优化多线程处理,调整嵌入获取和存储逻辑,增强模型一致性校验

This commit is contained in:
墨梓柒
2025-08-02 23:52:41 +08:00
parent 9b65b740be
commit 423525ead5
3 changed files with 238 additions and 93 deletions

View File

@@ -6,7 +6,6 @@ from src.chat.knowledge.qa_manager import QAManager
from src.chat.knowledge.kg_manager import KGManager
from src.chat.knowledge.global_logger import logger
from src.config.config import global_config as bot_global_config
from src.manager.local_store_manager import local_storage
import os
INVALID_ENTITY = [
@@ -21,9 +20,6 @@ INVALID_ENTITY = [
"她们",
"它们",
]
# Namespace identifiers for the knowledge base.
# PG_NAMESPACE is the prefix of paragraph keys, which are built as
# f"{PG_NAMESPACE}-{pg_hash}" further down in this module.
# NOTE(review): the entity/relation/RAG names presumably play the same role
# for their own stores -- confirm against the embedding and KG managers.
PG_NAMESPACE = "paragraph"
ENT_NAMESPACE = "entity"
REL_NAMESPACE = "relation"
RAG_GRAPH_NAMESPACE = "rag-graph"
RAG_ENT_CNT_NAMESPACE = "rag-ent-cnt"
@@ -34,54 +30,6 @@ ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..",
DATA_PATH = os.path.join(ROOT_PATH, "data")
def _initialize_knowledge_local_storage():
    """Seed the knowledge-base defaults in local storage.

    Each default listed below is written to ``local_storage`` only when the
    value currently read back for its key is ``None``; keys that already hold
    a value are left untouched. A single dict of defaults is used so the
    "set if missing" check is written once instead of once per key.

    Takes no arguments and returns ``None``; its only effects are the writes
    to ``local_storage`` and the accompanying log lines.
    """
    # Every key that must be present, with the value to use when it is unset.
    default_configs = {
        # Path settings
        "root_path": ROOT_PATH,
        # Reuse the module-level DATA_PATH (built with os.path.join) instead
        # of re-deriving it with a hard-coded "/" separator, so the stored
        # value always agrees with the constant the rest of the module uses.
        "data_path": DATA_PATH,
        # Entity and namespace settings
        "lpmm_invalid_entity": INVALID_ENTITY,
        "pg_namespace": PG_NAMESPACE,
        "ent_namespace": ENT_NAMESPACE,
        "rel_namespace": REL_NAMESPACE,
        # RAG-related namespace settings
        "rag_graph_namespace": RAG_GRAPH_NAMESPACE,
        "rag_ent_cnt_namespace": RAG_ENT_CNT_NAMESPACE,
        "rag_pg_hash_namespace": RAG_PG_HASH_NAMESPACE,
    }

    # Keys logged at info level when initialized; all others log at debug.
    important_configs = {"root_path", "data_path"}

    initialized_count = 0
    for key, default_value in default_configs.items():
        # Guard clause: never overwrite a value that is already stored.
        if local_storage[key] is not None:
            continue

        local_storage[key] = default_value

        # Pick the log level by importance; the message is the same.
        log = logger.info if key in important_configs else logger.debug
        log(f"设置{key}: {default_value}")

        initialized_count += 1

    if initialized_count > 0:
        logger.info(f"知识库本地存储初始化完成,共设置 {initialized_count} 项配置")
    else:
        logger.debug("知识库本地存储配置已存在,跳过初始化")
# Initialize the local storage paths
# sourcery skip: dict-comprehension
_initialize_knowledge_local_storage()
# Module-level manager handles, starting out as None.
# NOTE(review): presumably assigned further down when the LPMM knowledge
# feature is enabled -- confirm against the rest of the module.
qa_manager = None
inspire_manager = None
@@ -120,7 +68,7 @@ if bot_global_config.lpmm_knowledge.enable:
# 数据比对Embedding库与KG的段落hash集合
for pg_hash in kg_manager.stored_paragraph_hashes:
key = f"{PG_NAMESPACE}-{pg_hash}"
key = f"paragraph-{pg_hash}"
if key not in embed_manager.stored_pg_hashes:
logger.warning(f"KG中存在Embedding库中不存在的段落{key}")