Remove another batch of files
scripts/import_openie.py (new file)
@@ -0,0 +1,166 @@
# try:
#     import src.plugins.knowledge.lib.quick_algo
# except ImportError:
#     print("quick_algo library not found; the quick_algo algorithm cannot be used")
#     print("Please install the quick_algo library: inside lib.quick_algo, run: python setup.py build_ext --inplace")

import sys
import os

# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from typing import Dict, List

from src.plugins.knowledge.src.lpmmconfig import PG_NAMESPACE, global_config
from src.plugins.knowledge.src.embedding_store import EmbeddingManager
from src.plugins.knowledge.src.llm_client import LLMClient
from src.plugins.knowledge.src.open_ie import OpenIE
from src.plugins.knowledge.src.kg_manager import KGManager
from src.common.logger import get_module_logger
from src.plugins.knowledge.src.utils.hash import get_sha256

logger = get_module_logger("LPMM knowledge base - OpenIE import")
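
# Overall flow: load the embedding store and the knowledge graph (KG) from
# disk, cross-check their stored paragraph hashes, then import openie.json:
# deduplicate paragraphs, embed the new ones, rebuild the Faiss index, and
# extend the KG.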


def hash_deduplicate(
    raw_paragraphs: Dict[str, str],
    triple_list_data: Dict[str, List[List[str]]],
    stored_pg_hashes: set,
    stored_paragraph_hashes: set,
):
    """Deduplicate paragraphs by hash.

    Args:
        raw_paragraphs: raw paragraphs, keyed by index
        triple_list_data: triple lists, keyed by index
        stored_pg_hashes: paragraph hashes already stored in the embedding store (namespaced)
        stored_paragraph_hashes: paragraph hashes already stored in the KG

    Returns:
        new_raw_paragraphs: deduplicated paragraphs
        new_triple_list_data: deduplicated triples
    """
    # Deduplicated paragraphs
    new_raw_paragraphs = dict()
    # Deduplicated triples
    new_triple_list_data = dict()

    for raw_paragraph, triple_list in zip(raw_paragraphs.values(), triple_list_data.values()):
        # Paragraph hash
        paragraph_hash = get_sha256(raw_paragraph)
        # Skip a paragraph only if BOTH stores already contain it
        if ((PG_NAMESPACE + "-" + paragraph_hash) in stored_pg_hashes) and (paragraph_hash in stored_paragraph_hashes):
            continue
        new_raw_paragraphs[paragraph_hash] = raw_paragraph
        new_triple_list_data[paragraph_hash] = triple_list

    return new_raw_paragraphs, new_triple_list_data
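
# Example (hypothetical values): if PG_NAMESPACE were "pg" and a paragraph's
# SHA-256 were "abc...", the paragraph is skipped only when "pg-abc..." is in
# the embedding store AND "abc..." is in the KG's hash set; a paragraph present
# in only one of the two stores is re-imported so the stores converge.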


def handle_import_openie(openie_data: OpenIE, embed_manager: EmbeddingManager, kg_manager: KGManager) -> bool:
    # Extract raw paragraphs and triple lists from the OpenIE data
    # Raw paragraphs, keyed by index
    raw_paragraphs = openie_data.extract_raw_paragraph_dict()
    # Entity lists, keyed by index
    entity_list_data = openie_data.extract_entity_dict()
    # Triple lists, keyed by index
    triple_list_data = openie_data.extract_triple_dict()
    if len(raw_paragraphs) != len(entity_list_data) or len(raw_paragraphs) != len(triple_list_data):
        logger.error("OpenIE data is inconsistent")
        return False
    # Re-key the data by each paragraph's hash
    logger.info("Deduplicating and re-indexing paragraphs")
    raw_paragraphs, triple_list_data = hash_deduplicate(
        raw_paragraphs,
        triple_list_data,
        embed_manager.stored_pg_hashes,
        kg_manager.stored_paragraph_hashes,
    )
    if len(raw_paragraphs) != 0:
        # Compute and persist embeddings
        logger.info(f"Deduplication finished; paragraphs left to process: {len(raw_paragraphs)}")
        logger.info("Starting embedding")
        embed_manager.store_new_data_set(raw_paragraphs, triple_list_data)
        # Rebuild the Embedding-Faiss index
        logger.info("Rebuilding the vector index")
        embed_manager.rebuild_faiss_index()
        logger.info("Vector index built")
        embed_manager.save_to_file()
        logger.info("Embedding finished")
        # Build the RAG structures for the new paragraphs
        logger.info("Starting RAG construction")
        kg_manager.build_kg(triple_list_data, embed_manager)
        kg_manager.save_to_file()
        logger.info("RAG construction finished")
    else:
        logger.info("No new paragraphs to process")
    return True
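
# Note: handle_import_openie returns True even when there is nothing new to
# import; False is reserved for inconsistent OpenIE input, which main() treats
# as a fatal error.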


def main():
    # Confirmation prompt
    print("=== Important operation confirmation ===")
    print("The OpenIE import sends a large number of requests and may hit rate limits; choose your model with care")
    print("For reference: with a local model we sent about 80,000 requests in 70 minutes; with good network conditions it is faster")
    print("SiliconFlow's Pro/BAAI/bge-m3 is recommended")
    print("The cost is 0.7 CNY per million tokens")
    print("Importing knowledge consumes a lot of system resources; running it on a well-specced machine is recommended")
    print("In the same test, a 10700K was nearly maxed out, a 14900HX ran at 80% load, and peak memory usage was about 3 GB")
    confirm = input("Continue? (y/n): ").strip().lower()
    if confirm != "y":
        logger.info("Operation cancelled by user")
        print("Operation cancelled")
        sys.exit(1)
    print("\n" + "=" * 40 + "\n")

    logger.info("---- Starting OpenIE data import ----\n")

    logger.info("Creating LLM clients")
    llm_client_list = dict()
    for key in global_config["llm_providers"]:
        llm_client_list[key] = LLMClient(
            global_config["llm_providers"][key]["base_url"],
            global_config["llm_providers"][key]["api_key"],
        )

    # Initialize the embedding store
    embed_manager = EmbeddingManager(llm_client_list[global_config["embedding"]["provider"]])
    logger.info("Loading the embedding store from file")
    try:
        embed_manager.load_from_file()
    except Exception as e:
        logger.error("Error while loading the embedding store: {}".format(e))
    logger.info("Embedding store loaded")
    # Initialize the KG
    kg_manager = KGManager()
    logger.info("Loading the KG from file")
    try:
        kg_manager.load_from_file()
    except Exception as e:
        logger.error("Error while loading the KG: {}".format(e))
    logger.info("KG loaded")

    logger.info(f"KG node count: {len(kg_manager.graph.get_node_list())}")
    logger.info(f"KG edge count: {len(kg_manager.graph.get_edge_list())}")

    # Consistency check: compare the paragraph hash sets of the embedding store and the KG
    for pg_hash in kg_manager.stored_paragraph_hashes:
        key = PG_NAMESPACE + "-" + pg_hash
        if key not in embed_manager.stored_pg_hashes:
            logger.warning(f"Paragraph exists in the KG but not in the embedding store: {key}")

    logger.info("Importing the OpenIE data file")
    try:
        openie_data = OpenIE.load()
    except Exception as e:
        logger.error("Error while importing the OpenIE data file: {}".format(e))
        return False
    if handle_import_openie(openie_data, embed_manager, kg_manager) is False:
        logger.error("Error while processing the OpenIE data")
        return False
    return True


if __name__ == "__main__":
    main()
scripts/info_extraction.py (new file)
@@ -0,0 +1,177 @@
import json
import os
import signal
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Event
import sys

# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import tqdm

from src.common.logger import get_module_logger
from src.plugins.knowledge.src.lpmmconfig import global_config
from src.plugins.knowledge.src.ie_process import info_extract_from_str
from src.plugins.knowledge.src.llm_client import LLMClient
from src.plugins.knowledge.src.open_ie import OpenIE
from src.plugins.knowledge.src.raw_processing import load_raw_data

logger = get_module_logger("LPMM knowledge base - information extraction")


TEMP_DIR = "./temp"


# Thread-safe locks protecting file operations and shared data
file_lock = Lock()
open_ie_doc_lock = Lock()

# Event flag used to coordinate program shutdown
shutdown_event = Event()
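
# Concurrency notes: file_lock serializes reads/writes of the per-paragraph
# cache files under TEMP_DIR, open_ie_doc_lock guards the shared result list,
# and shutdown_event lets an interrupt stop the worker pool gracefully.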


def process_single_text(pg_hash, raw_data, llm_client_list):
    """Process a single text; runs inside the thread pool"""
    temp_file_path = f"{TEMP_DIR}/{pg_hash}.json"

    # Check for and read a cached result under the file lock
    with file_lock:
        if os.path.exists(temp_file_path):
            try:
                # A cached extraction result exists
                logger.info(f"Found cached extraction result: {pg_hash}")
                with open(temp_file_path, "r", encoding="utf-8") as f:
                    return json.load(f), None
            except json.JSONDecodeError:
                # The JSON cache file is corrupt: delete it and re-process
                logger.warning(f"Cache file corrupt, re-processing: {pg_hash}")
                os.remove(temp_file_path)

    entity_list, rdf_triple_list = info_extract_from_str(
        llm_client_list[global_config["entity_extract"]["llm"]["provider"]],
        llm_client_list[global_config["rdf_build"]["llm"]["provider"]],
        raw_data,
    )
    if entity_list is None or rdf_triple_list is None:
        return None, pg_hash
    else:
        doc_item = {
            "idx": pg_hash,
            "passage": raw_data,
            "extracted_entities": entity_list,
            "extracted_triples": rdf_triple_list,
        }
        # Save the intermediate extraction result
        with file_lock:
            try:
                with open(temp_file_path, "w", encoding="utf-8") as f:
                    json.dump(doc_item, f, ensure_ascii=False, indent=4)
            except Exception as e:
                logger.error(f"Failed to save cache file: {pg_hash}, error: {e}")
                # On failure, make sure no corrupt file is left behind
                if os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
                # Set shutdown_event to terminate the program
                shutdown_event.set()
                return None, pg_hash
        return doc_item, None
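
# Shape of a cached/returned doc_item (illustrative values):
# {"idx": "<sha256>", "passage": "<raw text>",
#  "extracted_entities": ["entity1", ...],
#  "extracted_triples": [["subject", "predicate", "object"], ...]}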


def signal_handler(signum, frame):
    """Handle the Ctrl+C signal"""
    logger.info("\nInterrupt signal received; shutting down gracefully...")
    shutdown_event.set()


def main():
    # Install the signal handler
    signal.signal(signal.SIGINT, signal_handler)

    # User confirmation prompt
    print("=== Important operation confirmation ===")
    print("Entity extraction costs a fair amount of money and time; running it during off-peak hours is recommended.")
    print("Example: for a 6,000,000-character full story, extraction with deepseek v3 0324 cost about 40 CNY and took about 3 hours.")
    print("Using SiliconFlow's non-Pro models is recommended,")
    print("or a Pro model whose cost can be offset with free credit.")
    print("Make sure your account balance is sufficient, and double-check before running.")
    confirm = input("Continue? (y/n): ").strip().lower()
    if confirm != "y":
        logger.info("Operation cancelled by user")
        print("Operation cancelled")
        sys.exit(1)
    print("\n" + "=" * 40 + "\n")

    logger.info("-------- Running information extraction --------\n")

    logger.info("Creating LLM clients")
    llm_client_list = dict()
    for key in global_config["llm_providers"]:
        llm_client_list[key] = LLMClient(
            global_config["llm_providers"][key]["base_url"],
            global_config["llm_providers"][key]["api_key"],
        )

    logger.info("Loading raw data")
    sha256_list, raw_datas = load_raw_data()
    logger.info("Raw data loaded\n")

    # Create the temp directory
    if not os.path.exists(f"{TEMP_DIR}"):
        os.makedirs(f"{TEMP_DIR}")

    failed_sha256 = []
    open_ie_doc = []

    # Create the thread pool; the worker count comes from the config
    workers = global_config["info_extraction"]["workers"]
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Submit all tasks to the pool
        future_to_hash = {
            executor.submit(process_single_text, pg_hash, raw_data, llm_client_list): pg_hash
            for pg_hash, raw_data in zip(sha256_list, raw_datas)
        }
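
        # future_to_hash maps each submitted Future back to its paragraph
        # hash, so a failure can be attributed to a specific source paragraph.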
        # Show progress with tqdm
        with tqdm.tqdm(total=len(future_to_hash), postfix="extracting: ") as pbar:
            # Handle tasks as they complete
            try:
                for future in as_completed(future_to_hash):
                    if shutdown_event.is_set():
                        # Cancel all unfinished tasks
                        for f in future_to_hash:
                            if not f.done():
                                f.cancel()
                        break

                    doc_item, failed_hash = future.result()
                    if failed_hash:
                        failed_sha256.append(failed_hash)
                        logger.error(f"Extraction failed: {failed_hash}")
                    elif doc_item:
                        with open_ie_doc_lock:
                            open_ie_doc.append(doc_item)
                    pbar.update(1)
            except KeyboardInterrupt:
                # If KeyboardInterrupt lands here, signal_handler probably did not fire
                logger.info("\nInterrupt signal received; shutting down gracefully...")
                shutdown_event.set()
                # Cancel all unfinished tasks
                for f in future_to_hash:
                    if not f.done():
                        f.cancel()

    # Save the extraction results
    # Note: the averages below assume at least one entity was extracted;
    # num_phrases == 0 would raise ZeroDivisionError
    sum_phrase_chars = sum([len(e) for chunk in open_ie_doc for e in chunk["extracted_entities"]])
    sum_phrase_words = sum([len(e.split()) for chunk in open_ie_doc for e in chunk["extracted_entities"]])
    num_phrases = sum([len(chunk["extracted_entities"]) for chunk in open_ie_doc])
    openie_obj = OpenIE(
        open_ie_doc,
        round(sum_phrase_chars / num_phrases, 4),
        round(sum_phrase_words / num_phrases, 4),
    )
    OpenIE.save(openie_obj)

    logger.info("-------- Information extraction finished --------")
    logger.info(f"SHA-256 of paragraphs that failed extraction: {failed_sha256}")


if __name__ == "__main__":
    main()
scripts/raw_data_preprocessor.py (new file)
@@ -0,0 +1,91 @@
import json
import os
from pathlib import Path
import sys  # added: system module import

# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.common.logger import get_module_logger


logger = get_module_logger("LPMM database - raw data processing")


def check_and_create_dirs():
    """Check for and create the required directories"""
    required_dirs = ["data/lpmm_raw_data", "data/imported_lpmm_data"]

    for dir_path in required_dirs:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            logger.info(f"Created directory: {dir_path}")


def process_text_file(file_path):
    """Process a single text file and return its list of paragraphs"""
    with open(file_path, "r", encoding="utf-8") as f:
        raw = f.read()

    paragraphs = []
    paragraph = ""
    for line in raw.split("\n"):
        if line.strip() == "":
            # A blank line ends the current paragraph
            if paragraph != "":
                paragraphs.append(paragraph.strip())
                paragraph = ""
        else:
            paragraph += line + "\n"

    # Flush the final paragraph if the file does not end with a blank line
    if paragraph != "":
        paragraphs.append(paragraph.strip())

    return paragraphs
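
# Example: "line1\nline2\n\nline3\n" yields ["line1\nline2", "line3"]:
# consecutive non-empty lines are joined into one paragraph, and blank lines
# act as paragraph separators.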


def main():
    # User confirmation prompt
    print("=== Important operation confirmation ===")
    print("If this is not your first knowledge import,")
    print("first delete data/import.json and back up data/openie.json.")
    print("Before importing the knowledge base,")
    print("adjust the settings in config/lpmm_config.toml.")
    confirm = input("Continue? (y/n): ").strip().lower()
    if confirm != "y":
        logger.error("Operation cancelled")
        sys.exit(1)
    print("\n" + "=" * 40 + "\n")

    # Check for and create the required directories
    check_and_create_dirs()

    # Check whether the output files already exist
    if os.path.exists("data/import.json"):
        logger.error("Error: data/import.json already exists; process or delete it first")
        sys.exit(1)

    if os.path.exists("data/openie.json"):
        logger.error("Error: data/openie.json already exists; process or delete it first")
        sys.exit(1)

    # Collect all raw text files
    raw_files = list(Path("data/lpmm_raw_data").glob("*.txt"))
    if not raw_files:
        logger.warning("Warning: no .txt files found in data/lpmm_raw_data")
        sys.exit(1)

    # Process every file
    all_paragraphs = []
    for file in raw_files:
        logger.info(f"Processing file: {file.name}")
        paragraphs = process_text_file(file)
        all_paragraphs.extend(paragraphs)

    # Save the merged result
    output_path = "data/import.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_paragraphs, f, ensure_ascii=False, indent=4)

    logger.info(f"Processing finished; result saved to: {output_path}")


if __name__ == "__main__":
    main()
scripts/tempCodeRunnerFile.py (new file)
@@ -0,0 +1 @@
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
@@ -1,3 +1,9 @@
import sys
import os

# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import asyncio
import time
from src.plugins.models.utils_model import LLMRequest
@@ -64,12 +70,6 @@ async def test_with_tool_calls():
    prompt = "Please analyze the current weather conditions and look up important events that happened on this day in history. Also, which is larger, 3.9 or 3.11? Use the appropriate tools to obtain this information."
    prompt = """
Your name is 麦麦. You are open-minded and accepting, emotionally sensitive, and sometimes playfully funny. You are a female university student studying psychology and brain science, now in your second year. You browse Tieba, sometimes like to say odd things, and enjoy scrolling Xiaohongshu.
Your inner thoughts just now were: 漂移菌 mentioned that the canned food has all been eaten, so you could keep playing along with that meme, e.g. feigning grievance with "so is tonight's overtime pay going to get eaten too" or a "catgirl strike warning". But 薯薯 and 薯宝 have already picked up quite a few memes, and 漂移菌 just joined in, so the topic may still be warm and worth another exchange. If nobody responds, you could ask whether anyone has had a funny workplace experience like "the code is finished but the reward got eaten" and steer toward a lighter topic.

No tools are needed for now.
-----------------------------------
It is now 2025-04-25 17:38:37. You are online, chatting with people in a QQ group. Here is the conversation in progress:
2025-04-25 17:34:08 麦麦 (you) said: [expressed: playful, mischievous.];
2025-04-25 17:34:39 漂移菌 said: @麦麦。(id:3936257206) you are a catgirl;
2025-04-25 17:34:42 薯宝 said: 🤣;
2025-04-25 17:34:43 麦麦 (you) said: sure, give me half your salary;