From 2c93b2dac8ec12e0d0630ae5830e4fa25aac54a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A2=A8=E6=A2=93=E6=9F=92?= <1787882683@qq.com> Date: Sun, 3 Aug 2025 11:31:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4lpmmconfig.py=E5=92=8Craw=5Fp?= =?UTF-8?q?rocessing.py=E6=96=87=E4=BB=B6=EF=BC=8C=E7=A7=BB=E9=99=A4?= =?UTF-8?q?=E4=B8=8D=E5=86=8D=E4=BD=BF=E7=94=A8=E7=9A=84=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E5=92=8C=E6=95=B0=E6=8D=AE=E5=8A=A0=E8=BD=BD=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/knowledge/lpmmconfig.py | 21 ------------ src/chat/knowledge/raw_processing.py | 48 ---------------------------- 2 files changed, 69 deletions(-) delete mode 100644 src/chat/knowledge/lpmmconfig.py delete mode 100644 src/chat/knowledge/raw_processing.py diff --git a/src/chat/knowledge/lpmmconfig.py b/src/chat/knowledge/lpmmconfig.py deleted file mode 100644 index 12e8474fe..000000000 --- a/src/chat/knowledge/lpmmconfig.py +++ /dev/null @@ -1,21 +0,0 @@ -PG_NAMESPACE = "paragraph" -ENT_NAMESPACE = "entity" -REL_NAMESPACE = "relation" - -RAG_GRAPH_NAMESPACE = "rag-graph" -RAG_ENT_CNT_NAMESPACE = "rag-ent-cnt" -RAG_PG_HASH_NAMESPACE = "rag-pg-hash" - -# 无效实体 -INVALID_ENTITY = [ - "", - "你", - "他", - "她", - "它", - "我们", - "你们", - "他们", - "她们", - "它们", -] \ No newline at end of file diff --git a/src/chat/knowledge/raw_processing.py b/src/chat/knowledge/raw_processing.py deleted file mode 100644 index 98b1f1687..000000000 --- a/src/chat/knowledge/raw_processing.py +++ /dev/null @@ -1,48 +0,0 @@ -import json -import os - -from .global_logger import logger -from .lpmmconfig import global_config -from src.chat.knowledge.utils.hash import get_sha256 - - -def load_raw_data(path: str = None) -> tuple[list[str], list[str]]: - """加载原始数据文件 - - 读取原始数据文件,将原始数据加载到内存中 - - Args: - path: 可选,指定要读取的json文件绝对路径 - - Returns: - - raw_data: 原始数据列表 - - sha256_list: 原始数据的SHA256集合 - """ - # 读取指定路径或默认路径的json文件 - json_path = path if path else global_config["persistence"]["raw_data_path"] - if os.path.exists(json_path): - with open(json_path, "r", encoding="utf-8") as f: - import_json = json.loads(f.read()) - else: - raise Exception(f"原始数据文件读取失败: {json_path}") - """ - import_json 内容示例: - import_json = ["The capital of China is Beijing. The capital of France is Paris.",] - """ - raw_data = [] - sha256_list = [] - sha256_set = set() - for item in import_json: - if not isinstance(item, str): - logger.warning("数据类型错误:{}".format(item)) - continue - pg_hash = get_sha256(item) - if pg_hash in sha256_set: - logger.warning("重复数据:{}".format(item)) - continue - sha256_set.add(pg_hash) - sha256_list.append(pg_hash) - raw_data.append(item) - logger.info("共读取到{}条数据".format(len(raw_data))) - - return sha256_list, raw_data