From 81e5c1bb8bdc14b3e42b5fb84b1a49bb245409d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A2=A8=E6=A2=93=E6=9F=92?= <1787882683@qq.com>
Date: Fri, 2 May 2025 15:45:42 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BAOpenIE=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=AE=8C=E6=95=B4=E6=80=A7=E6=A3=80=E6=9F=A5=EF=BC=8C?=
 =?UTF-8?q?=E4=BC=98=E5=8C=96=E9=94=99=E8=AF=AF=E6=97=A5=E5=BF=97=E8=BE=93?=
 =?UTF-8?q?=E5=87=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/import_openie.py | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/scripts/import_openie.py b/scripts/import_openie.py
index dd4b50ece..2a6e09b73 100644
--- a/scripts/import_openie.py
+++ b/scripts/import_openie.py
@@ -6,6 +6,7 @@
 
 import sys
 import os
+from time import sleep
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 
@@ -26,7 +27,7 @@ OPENIE_DIR = (
     else os.path.join(ROOT_PATH, "data/openie")
 )
 
-logger = get_module_logger("LPMM知识库-OpenIE导入")
+logger = get_module_logger("OpenIE导入")
 
 
 def hash_deduplicate(
@@ -71,8 +72,45 @@ def handle_import_openie(openie_data: OpenIE, embed_manager: EmbeddingManager, k
     entity_list_data = openie_data.extract_entity_dict()
     # 索引的三元组列表
     triple_list_data = openie_data.extract_triple_dict()
+    # print(openie_data.docs)
     if len(raw_paragraphs) != len(entity_list_data) or len(raw_paragraphs) != len(triple_list_data):
         logger.error("OpenIE数据存在异常")
+        logger.error(f"原始段落数量：{len(raw_paragraphs)}")
+        logger.error(f"实体列表数量：{len(entity_list_data)}")
+        logger.error(f"三元组列表数量：{len(triple_list_data)}")
+        logger.error("OpenIE数据段落数量与实体列表数量或三元组列表数量不一致")
+        logger.error("请保证你的原始数据分段良好，不要有类似于 “.....” 单独成一段的情况")
+        logger.error("或者一段中只有符号的情况")
+        # 新增：检查docs中每条数据的完整性
+        logger.error("系统将于2秒后开始检查数据完整性")
+        sleep(2)
+        found_missing = False
+        for doc in getattr(openie_data, "docs", []):
+            idx = doc.get("idx", "<无idx>")
+            passage = doc.get("passage", "<无passage>")
+            missing = []
+            # 检查字段是否存在且非空
+            if "passage" not in doc or not doc.get("passage"):
+                missing.append("passage")
+            if "extracted_entities" not in doc or not isinstance(doc.get("extracted_entities"), list):
+                missing.append("名词列表缺失")
+            elif len(doc.get("extracted_entities", [])) == 0:
+                missing.append("名词列表为空")
+            if "extracted_triples" not in doc or not isinstance(doc.get("extracted_triples"), list):
+                missing.append("主谓宾三元组缺失")
+            elif len(doc.get("extracted_triples", [])) == 0:
+                missing.append("主谓宾三元组为空")
+            # 输出所有doc的idx
+            # print(f"检查: idx={idx}")
+            if missing:
+                found_missing = True
+                logger.error("\n")
+                logger.error("数据缺失：")
+                logger.error(f"对应哈希值：{idx}")
+                logger.error(f"对应文段内容：{passage}")
+                logger.error(f"非法原因：{', '.join(missing)}")
+        if not found_missing:
+            print("所有数据均完整，没有发现缺失字段。")
         return False
     # 将索引换为对应段落的hash值
     logger.info("正在进行段落去重与重索引")