删除lpmmconfig.py和raw_processing.py文件,移除不再使用的配置和数据加载逻辑

This commit is contained in:
墨梓柒
2025-08-03 11:31:39 +08:00
parent e6855bbe56
commit 2c93b2dac8
2 changed files with 0 additions and 69 deletions

View File

@@ -1,21 +0,0 @@
# Namespace name constants for paragraphs, entities and relations.
PG_NAMESPACE = "paragraph"
ENT_NAMESPACE = "entity"
REL_NAMESPACE = "relation"

# Namespace name constants for the RAG graph and its related records.
RAG_GRAPH_NAMESPACE = "rag-graph"
RAG_ENT_CNT_NAMESPACE = "rag-ent-cnt"
RAG_PG_HASH_NAMESPACE = "rag-pg-hash"

# Invalid entities.
# NOTE(review): the five identical empty strings are redundant for membership
# tests and may be entries whose text was lost — confirm against the original.
INVALID_ENTITY = [
    "", "", "", "", "",
    "我们", "你们", "他们", "她们", "它们",
]

View File

@@ -1,48 +0,0 @@
import json
import os
from .global_logger import logger
from .lpmmconfig import global_config
from src.chat.knowledge.utils.hash import get_sha256
def load_raw_data(path: "str | None" = None) -> tuple[list[str], list[str]]:
    """Load the raw-data JSON file and return its unique string entries.

    The file is parsed as JSON and iterated item by item. Items that are not
    strings, and items whose ``get_sha256`` hash has already been seen, are
    skipped with a warning.

    Args:
        path: Optional path of the JSON file to read. When omitted (or
            falsy), ``global_config["persistence"]["raw_data_path"]`` is
            used instead.

    Returns:
        A ``(sha256_list, raw_data)`` tuple, in that order:

        - sha256_list: hash of each kept item, in file order.
        - raw_data: the kept items themselves, index-aligned with
          ``sha256_list``.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist. This is a
            subclass of ``Exception``, so handlers written for the previous
            bare ``Exception`` still catch it.
    """
    # Use the explicit path when given, otherwise the configured default.
    json_path = path or global_config["persistence"]["raw_data_path"]
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"原始数据文件读取失败: {json_path}")

    with open(json_path, "r", encoding="utf-8") as f:
        import_json = json.load(f)

    # Example of the expected content (a JSON array of strings):
    #   ["The capital of China is Beijing. The capital of France is Paris."]
    raw_data = []
    sha256_list = []
    seen_hashes = set()  # O(1) duplicate detection alongside the ordered list
    for item in import_json:
        if not isinstance(item, str):
            logger.warning(f"数据类型错误:{item}")
            continue
        pg_hash = get_sha256(item)
        if pg_hash in seen_hashes:
            logger.warning(f"重复数据:{item}")
            continue
        seen_hashes.add(pg_hash)
        sha256_list.append(pg_hash)
        raw_data.append(item)

    logger.info(f"共读取到{len(raw_data)}条数据")
    # Hashes come first in the returned tuple; callers rely on this order.
    return sha256_list, raw_data