From 5fafa2a8924c686721437963a26e5b84f9d03c5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A2=A8=E6=A2=93=E6=9F=92?= <1787882683@qq.com>
Date: Mon, 5 May 2025 21:27:11 +0800
Subject: [PATCH 1/5] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=AF=BC=E5=85=A5=E5=A4=84=E7=90=86=EF=BC=8C=E6=96=B0?=
 =?UTF-8?q?=E5=A2=9E=E9=9D=9E=E6=B3=95=E6=96=87=E6=AE=B5=E6=A3=80=E6=B5=8B?=
 =?UTF-8?q?=E4=B8=8E=E7=94=A8=E6=88=B7=E7=A1=AE=E8=AE=A4=E5=88=A0=E9=99=A4?=
 =?UTF-8?q?=E5=8A=9F=E8=83=BD=EF=BC=9B=E4=BC=98=E5=8C=96=E5=8E=9F=E5=A7=8B?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=B7=AF=E5=BE=84=E5=88=9B=E5=BB=BA=E4=B8=8E?=
 =?UTF-8?q?=E6=97=A5=E5=BF=97=E8=AE=B0=E5=BD=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/import_openie.py         | 27 +++++++++++++++++++++++++--
 scripts/raw_data_preprocessor.py | 21 +++++++++++++++------
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/scripts/import_openie.py b/scripts/import_openie.py
index 851cc8b31..eae0683db 100644
--- a/scripts/import_openie.py
+++ b/scripts/import_openie.py
@@ -85,6 +85,7 @@ def handle_import_openie(openie_data: OpenIE, embed_manager: EmbeddingManager, k
         logger.error("系统将于2秒后开始检查数据完整性")
         sleep(2)
         found_missing = False
+        missing_idxs = []
         for doc in getattr(openie_data, "docs", []):
             idx = doc.get("idx", "<无idx>")
             passage = doc.get("passage", "<无passage>")
@@ -104,14 +105,36 @@ def handle_import_openie(openie_data: OpenIE, embed_manager: EmbeddingManager, k
             # print(f"检查: idx={idx}")
             if missing:
                 found_missing = True
+                missing_idxs.append(idx)
                 logger.error("\n")
                 logger.error("数据缺失：")
                 logger.error(f"对应哈希值：{idx}")
                 logger.error(f"对应文段内容内容：{passage}")
                 logger.error(f"非法原因：{', '.join(missing)}")
+        # 确保提示在所有非法数据输出后再输出
         if not found_missing:
-            print("所有数据均完整，没有发现缺失字段。")
-        return False
+            logger.info("所有数据均完整，没有发现缺失字段。")
+            return False
+        # 新增：提示用户是否删除非法文段继续导入
+        # The prompt is logged after all logger.error output above so it is not scrolled away
+        logger.info("\n检测到非法文段，共{}条。".format(len(missing_idxs)))
+        logger.info("\n是否删除所有非法文段后继续导入？(y/n): ", end="")
+        user_choice = input().strip().lower()
+        if user_choice != "y":
+            logger.info("用户选择不删除非法文段，程序终止。")
+            sys.exit(1)
+        # 删除非法文段
+        logger.info("正在删除非法文段并继续导入...")
+        # 过滤掉非法文段
+        openie_data.docs = [doc for doc in getattr(openie_data, "docs", []) if doc.get("idx", "<无idx>") not in missing_idxs]
+        # 重新提取数据
+        raw_paragraphs = openie_data.extract_raw_paragraph_dict()
+        entity_list_data = openie_data.extract_entity_dict()
+        triple_list_data = openie_data.extract_triple_dict()
+        # 再次校验
+        if len(raw_paragraphs) != len(entity_list_data) or len(raw_paragraphs) != len(triple_list_data):
+            logger.error("删除非法文段后，数据仍不一致，程序终止。")
+            sys.exit(1)
     # 将索引换为对应段落的hash值
     logger.info("正在进行段落去重与重索引")
     raw_paragraphs, triple_list_data = hash_deduplicate(
diff --git a/scripts/raw_data_preprocessor.py b/scripts/raw_data_preprocessor.py
index c87c30ca8..33fdede9e 100644
--- a/scripts/raw_data_preprocessor.py
+++ b/scripts/raw_data_preprocessor.py
@@ -5,12 +5,21 @@ import sys  # 新增系统模块导入
 import datetime  # 新增导入
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-from src.common.logger import get_module_logger
+from src.common.logger_manager import get_logger
+from src.plugins.knowledge.src.lpmmconfig import global_config
 
-logger = get_module_logger("LPMM数据库-原始数据处理")
+logger = get_logger("lpmm")
 ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 RAW_DATA_PATH = os.path.join(ROOT_PATH, "data/lpmm_raw_data")
-IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data/imported_lpmm_data")
+# 新增：确保 RAW_DATA_PATH 存在
+if not os.path.exists(RAW_DATA_PATH):
+    os.makedirs(RAW_DATA_PATH, exist_ok=True)
+    logger.info(f"已创建目录: {RAW_DATA_PATH}")
+
+if global_config.get("persistence", {}).get("raw_data_path") is not None:
+    IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, global_config["persistence"]["raw_data_path"])
+else:
+    IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data/imported_lpmm_data")
 
 # 添加项目根目录到 sys.path
 
@@ -54,7 +63,7 @@ def main():
     print("请确保原始数据已放置在正确的目录中。")
     confirm = input("确认继续执行？(y/n): ").strip().lower()
     if confirm != "y":
-        logger.error("操作已取消")
+        logger.info("操作已取消")
         sys.exit(1)
     print("\n" + "=" * 40 + "\n")
 
@@ -94,6 +103,6 @@ def main():
 
 
 if __name__ == "__main__":
-    print(f"Raw Data Path: {RAW_DATA_PATH}")
-    print(f"Imported Data Path: {IMPORTED_DATA_PATH}")
+    logger.info(f"原始数据路径: {RAW_DATA_PATH}")
+    logger.info(f"处理后的数据路径: {IMPORTED_DATA_PATH}")
     main()

From e1c1b0ee2e0a5e3f50bf8f36728798190aefafd2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Mon, 5 May 2025 13:27:28 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=A4=96=20=E8=87=AA=E5=8A=A8=E6=A0=BC?=
 =?UTF-8?q?=E5=BC=8F=E5=8C=96=E4=BB=A3=E7=A0=81=20[skip=20ci]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/import_openie.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/import_openie.py b/scripts/import_openie.py
index eae0683db..f81cbb5c7 100644
--- a/scripts/import_openie.py
+++ b/scripts/import_openie.py
@@ -126,7 +126,9 @@ def handle_import_openie(openie_data: OpenIE, embed_manager: EmbeddingManager, k
         # 删除非法文段
         logger.info("正在删除非法文段并继续导入...")
         # 过滤掉非法文段
-        openie_data.docs = [doc for doc in getattr(openie_data, "docs", []) if doc.get("idx", "<无idx>") not in missing_idxs]
+        openie_data.docs = [
+            doc for doc in getattr(openie_data, "docs", []) if doc.get("idx", "<无idx>") not in missing_idxs
+        ]
         # 重新提取数据
         raw_paragraphs = openie_data.extract_raw_paragraph_dict()
         entity_list_data = openie_data.extract_entity_dict()

From 0147f49ee9ce05250173fa2ebb0fe7b030499e7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A2=A8=E6=A2=93=E6=9F=92?= <1787882683@qq.com>
Date: Mon, 5 May 2025 21:42:45 +0800
Subject: [PATCH 3/5] =?UTF-8?q?fix:=20=E8=B0=83=E6=95=B4lpmm=20template?=
 =?UTF-8?q?=E4=B8=AD=E5=AE=9E=E4=BD=93=E6=8F=90=E5=8F=96=E7=BA=BF=E7=A8=8B?=
 =?UTF-8?q?=E6=95=B0=EF=BC=8C=E9=9D=9EPro=E6=A8=A1=E5=9E=8B=E8=AE=BE?=
 =?UTF-8?q?=E7=BD=AE=E4=B8=BA3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 template/lpmm_config_template.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/template/lpmm_config_template.toml b/template/lpmm_config_template.toml
index aae664d51..745cbaaf6 100644
--- a/template/lpmm_config_template.toml
+++ b/template/lpmm_config_template.toml
@@ -38,8 +38,8 @@ synonym_threshold = 0.8   # 同义词阈值（相似度高于此阈值的词语
 provider = "siliconflow"                 # 服务提供商
 model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" # 模型名称
 
-[info_extraction]
-workers = 10
+[info_extraction]
+workers = 3            # 实体提取同时执行线程数，非Pro模型不要设置超过5
 
 [qa.params]
 # QA参数配置

From f5894e01539520744017c1499b8c01714d23feb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A2=A8=E6=A2=93=E6=9F=92?= <1787882683@qq.com>
Date: Mon, 5 May 2025 22:04:50 +0800
Subject: [PATCH 4/5] =?UTF-8?q?fix:=20=E4=BC=98=E5=8C=96=E5=B5=8C=E5=85=A5?=
 =?UTF-8?q?=E5=BA=93=E5=8A=A0=E8=BD=BD=E8=BF=87=E7=A8=8B=EF=BC=8C=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E8=BF=9B=E5=BA=A6=E6=9D=A1=E6=98=BE=E7=A4=BA=EF=BC=9B?=
 =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=A6=96=E6=AC=A1=E5=AF=BC=E5=85=A5=E7=9F=A5?=
 =?UTF-8?q?=E8=AF=86=E6=97=B6=E7=9A=84=E9=94=99=E8=AF=AF=E6=8F=90=E7=A4=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/import_openie.py                     |  3 ++-
 src/plugins/knowledge/src/embedding_store.py | 22 ++++++++++++++++----
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/scripts/import_openie.py b/scripts/import_openie.py
index f81cbb5c7..472667c14 100644
--- a/scripts/import_openie.py
+++ b/scripts/import_openie.py
@@ -204,7 +204,8 @@ def main():
             logger.error("请保证你的嵌入模型从未更改,并且在导入时使用相同的模型")
             # print("检测到嵌入模型与本地存储不一致，已终止导入。请检查模型设置或清空嵌入库后重试。")
             sys.exit(1)
-        logger.error("如果你是第一次导入知识，请忽略此错误")
+        if "不存在" in str(e):
+            logger.error("如果你是第一次导入知识，请忽略此错误")
     logger.info("Embedding库加载完成")
     # 初始化KG
     kg_manager = KGManager()
diff --git a/src/plugins/knowledge/src/embedding_store.py b/src/plugins/knowledge/src/embedding_store.py
index 5ee92a869..7d012b19b 100644
--- a/src/plugins/knowledge/src/embedding_store.py
+++ b/src/plugins/knowledge/src/embedding_store.py
@@ -6,7 +6,7 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
-import tqdm
+# import tqdm
 import faiss
 
 from .llm_client import LLMClient
@@ -194,11 +194,25 @@ class EmbeddingStore:
         """从文件中加载"""
         if not os.path.exists(self.embedding_file_path):
             raise Exception(f"文件{self.embedding_file_path}不存在")
-
         logger.info(f"正在从文件{self.embedding_file_path}中加载{self.namespace}嵌入库")
         data_frame = pd.read_parquet(self.embedding_file_path, engine="pyarrow")
-        for _, row in tqdm.tqdm(data_frame.iterrows(), total=len(data_frame)):
-            self.store[row["hash"]] = EmbeddingStoreItem(row["hash"], row["embedding"], row["str"])
+        total = len(data_frame)
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            MofNCompleteColumn(),
+            "•",
+            TimeElapsedColumn(),
+            "<",
+            TimeRemainingColumn(),
+            transient=False,
+        ) as progress:
+            task = progress.add_task("加载嵌入库", total=total)
+            for _, row in data_frame.iterrows():
+                self.store[row["hash"]] = EmbeddingStoreItem(row["hash"], row["embedding"], row["str"])
+                progress.update(task, advance=1)
         logger.info(f"{self.namespace}嵌入库加载成功")
 
         try:

From 5f8389fa01931b8b539296d68771bdbf589acd5d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Mon, 5 May 2025 14:05:05 +0000
Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=A4=96=20=E8=87=AA=E5=8A=A8=E6=A0=BC?=
 =?UTF-8?q?=E5=BC=8F=E5=8C=96=E4=BB=A3=E7=A0=81=20[skip=20ci]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/plugins/knowledge/src/embedding_store.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plugins/knowledge/src/embedding_store.py b/src/plugins/knowledge/src/embedding_store.py
index 7d012b19b..d1eb7f90f 100644
--- a/src/plugins/knowledge/src/embedding_store.py
+++ b/src/plugins/knowledge/src/embedding_store.py
@@ -6,6 +6,7 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
+
 # import tqdm
 import faiss