fix(embedding): resolve event-loop conflicts that broke embedding generation

This change fixes the event-loop related problems in embedding generation:
- EmbeddingStore._get_embedding now synchronously creates, uses, and destroys a fresh event loop per call, eliminating nested-event-loop errors
- _get_embeddings_batch_threaded now gives each worker thread its own independent, short-lived event loop
- A new force_new parameter forces a fresh client instance for LLM embedding requests, reducing reuse of objects across loops
- The OpenAI client's embedding call now emits detailed logs to help diagnose network failures
- get_embedding() rebuilds its LLMRequest on every call, so instances are less likely to cross event loops

Although this change drives async interfaces in a synchronous create-run-close style, it leaves the existing API untouched and keeps the vector database and knowledge-retrieval features stable. (It also moves the scripts folder back.)
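A minimal sketch of the create-use-destroy pattern described above, assuming hypothetical names (client_factory and client.embed are placeholders; the real logic lives in EmbeddingStore._get_embedding and _get_embeddings_batch_threaded):

import asyncio
from concurrent.futures import ThreadPoolExecutor

def get_embedding_sync(client_factory, text):
    """Run an async embedding call on a private, short-lived event loop."""
    loop = asyncio.new_event_loop()  # fresh loop owned by this call; never nests in a caller's loop
    try:
        client = client_factory()  # force_new-style: no client reuse across loops
        return loop.run_until_complete(client.embed(text))
    finally:
        loop.close()  # tear the loop down so no awaitable outlives it

def get_embeddings_batch_threaded(client_factory, texts, workers=4):
    """Each worker thread builds and tears down its own loop via the helper above."""
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(lambda t: get_embedding_sync(client_factory, t), texts))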
minecraft1024a
2025-08-19 20:41:00 +08:00
parent f3b5836eee
commit 3bef6f4bab
16 changed files with 4695 additions and 23 deletions

scripts/expression_stats.py (new file)

@@ -0,0 +1,208 @@
import time
import sys
import os
from typing import Dict, List
# Add project root to Python path before importing project modules
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, project_root)
from src.common.database.database_model import Expression, ChatStreams
def get_chat_name(chat_id: str) -> str:
"""Get chat name from chat_id by querying ChatStreams table directly"""
try:
# 直接从数据库查询ChatStreams表
chat_stream = ChatStreams.get_or_none(ChatStreams.stream_id == chat_id)
if chat_stream is None:
return f"未知聊天 ({chat_id})"
# 如果有群组信息,显示群组名称
if chat_stream.group_name:
return f"{chat_stream.group_name} ({chat_id})"
# 如果是私聊,显示用户昵称
elif chat_stream.user_nickname:
return f"{chat_stream.user_nickname}的私聊 ({chat_id})"
else:
return f"未知聊天 ({chat_id})"
except Exception:
return f"查询失败 ({chat_id})"
def calculate_time_distribution(expressions) -> Dict[str, int]:
"""Calculate distribution of last active time in days"""
now = time.time()
distribution = {
'0-1天': 0,
'1-3天': 0,
'3-7天': 0,
'7-14天': 0,
'14-30天': 0,
'30-60天': 0,
'60-90天': 0,
'90+天': 0
}
for expr in expressions:
diff_days = (now - expr.last_active_time) / (24*3600)
if diff_days < 1:
distribution['0-1天'] += 1
elif diff_days < 3:
distribution['1-3天'] += 1
elif diff_days < 7:
distribution['3-7天'] += 1
elif diff_days < 14:
distribution['7-14天'] += 1
elif diff_days < 30:
distribution['14-30天'] += 1
elif diff_days < 60:
distribution['30-60天'] += 1
elif diff_days < 90:
distribution['60-90天'] += 1
else:
distribution['90+天'] += 1
return distribution
def calculate_count_distribution(expressions) -> Dict[str, int]:
"""Calculate distribution of count values"""
distribution = {
'0-1': 0,
'1-2': 0,
'2-3': 0,
'3-4': 0,
'4-5': 0,
'5-10': 0,
'10+': 0
}
for expr in expressions:
cnt = expr.count
if cnt < 1:
distribution['0-1'] += 1
elif cnt < 2:
distribution['1-2'] += 1
elif cnt < 3:
distribution['2-3'] += 1
elif cnt < 4:
distribution['3-4'] += 1
elif cnt < 5:
distribution['4-5'] += 1
elif cnt < 10:
distribution['5-10'] += 1
else:
distribution['10+'] += 1
return distribution
def get_top_expressions_by_chat(chat_id: str, top_n: int = 5) -> List[Expression]:
"""Get top N most used expressions for a specific chat_id"""
    return list(
        Expression.select()
        .where(Expression.chat_id == chat_id)
        .order_by(Expression.count.desc())
        .limit(top_n)
    )
def show_overall_statistics(expressions, total: int) -> None:
"""Show overall statistics"""
time_dist = calculate_time_distribution(expressions)
count_dist = calculate_count_distribution(expressions)
print("\n=== 总体统计 ===")
print(f"总表达式数量: {total}")
print("\n上次激活时间分布:")
for period, count in time_dist.items():
print(f"{period}: {count} ({count/total*100:.2f}%)")
print("\ncount分布:")
for range_, count in count_dist.items():
print(f"{range_}: {count} ({count/total*100:.2f}%)")
def show_chat_statistics(chat_id: str, chat_name: str) -> None:
"""Show statistics for a specific chat"""
chat_exprs = list(Expression.select().where(Expression.chat_id == chat_id))
chat_total = len(chat_exprs)
print(f"\n=== {chat_name} ===")
print(f"表达式数量: {chat_total}")
if chat_total == 0:
print("该聊天没有表达式数据")
return
# Time distribution for this chat
time_dist = calculate_time_distribution(chat_exprs)
print("\n上次激活时间分布:")
for period, count in time_dist.items():
if count > 0:
print(f"{period}: {count} ({count/chat_total*100:.2f}%)")
# Count distribution for this chat
count_dist = calculate_count_distribution(chat_exprs)
print("\ncount分布:")
for range_, count in count_dist.items():
if count > 0:
print(f"{range_}: {count} ({count/chat_total*100:.2f}%)")
# Top expressions
print("\nTop 10使用最多的表达式:")
top_exprs = get_top_expressions_by_chat(chat_id, 10)
for i, expr in enumerate(top_exprs, 1):
print(f"{i}. [{expr.type}] Count: {expr.count}")
print(f" Situation: {expr.situation}")
print(f" Style: {expr.style}")
print()
def interactive_menu() -> None:
"""Interactive menu for expression statistics"""
# Get all expressions
expressions = list(Expression.select())
if not expressions:
print("数据库中没有找到表达式")
return
total = len(expressions)
# Get unique chat_ids and their names
chat_ids = list(set(expr.chat_id for expr in expressions))
chat_info = [(chat_id, get_chat_name(chat_id)) for chat_id in chat_ids]
chat_info.sort(key=lambda x: x[1]) # Sort by chat name
while True:
print("\n" + "="*50)
print("表达式统计分析")
print("="*50)
print("0. 显示总体统计")
for i, (chat_id, chat_name) in enumerate(chat_info, 1):
chat_count = sum(1 for expr in expressions if expr.chat_id == chat_id)
print(f"{i}. {chat_name} ({chat_count}个表达式)")
print("q. 退出")
choice = input("\n请选择要查看的统计 (输入序号): ").strip()
if choice.lower() == 'q':
print("再见!")
break
try:
choice_num = int(choice)
if choice_num == 0:
show_overall_statistics(expressions, total)
elif 1 <= choice_num <= len(chat_info):
chat_id, chat_name = chat_info[choice_num - 1]
show_chat_statistics(chat_id, chat_name)
else:
print("无效的选择,请重新输入")
except ValueError:
print("请输入有效的数字")
input("\n按回车键继续...")
if __name__ == "__main__":
interactive_menu()

scripts/import_openie.py (new file)

@@ -0,0 +1,268 @@
# try:
# import src.plugins.knowledge.lib.quick_algo
# except ImportError:
# print("未找到quick_algo库无法使用quick_algo算法")
# print("请安装quick_algo库 - 在lib.quick_algo中执行命令python setup.py build_ext --inplace")
import sys
import os
import asyncio
from time import sleep
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.chat.knowledge.embedding_store import EmbeddingManager
from src.chat.knowledge.open_ie import OpenIE
from src.chat.knowledge.kg_manager import KGManager
from src.common.logger import get_logger
from src.chat.knowledge.utils.hash import get_sha256
# 项目根目录(sys.path 已在上方 append)
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
OPENIE_DIR = os.path.join(ROOT_PATH, "data", "openie")
logger = get_logger("OpenIE导入")
def ensure_openie_dir():
"""确保OpenIE数据目录存在"""
if not os.path.exists(OPENIE_DIR):
os.makedirs(OPENIE_DIR)
logger.info(f"创建OpenIE数据目录{OPENIE_DIR}")
else:
logger.info(f"OpenIE数据目录已存在{OPENIE_DIR}")
def hash_deduplicate(
raw_paragraphs: dict[str, str],
triple_list_data: dict[str, list[list[str]]],
stored_pg_hashes: set,
stored_paragraph_hashes: set,
):
"""Hash去重
Args:
raw_paragraphs: 索引的段落原文
triple_list_data: 索引的三元组列表
        stored_pg_hashes: EmbeddingStore 中已存储的段落 key 集合(namespace-hash 格式)
        stored_paragraph_hashes: KG 中已存储的段落 hash 集合
Returns:
new_raw_paragraphs: 去重后的段落
new_triple_list_data: 去重后的三元组
"""
# 保存去重后的段落
new_raw_paragraphs = {}
# 保存去重后的三元组
new_triple_list_data = {}
    for raw_paragraph, triple_list in zip(raw_paragraphs.values(), triple_list_data.values(), strict=False):
# 段落hash
paragraph_hash = get_sha256(raw_paragraph)
# 使用与EmbeddingStore中一致的命名空间格式namespace-hash
paragraph_key = f"paragraph-{paragraph_hash}"
if paragraph_key in stored_pg_hashes and paragraph_hash in stored_paragraph_hashes:
continue
new_raw_paragraphs[paragraph_hash] = raw_paragraph
new_triple_list_data[paragraph_hash] = triple_list
return new_raw_paragraphs, new_triple_list_data
def handle_import_openie(openie_data: OpenIE, embed_manager: EmbeddingManager, kg_manager: KGManager) -> bool:
# sourcery skip: extract-method
# 从OpenIE数据中提取段落原文与三元组列表
# 索引的段落原文
raw_paragraphs = openie_data.extract_raw_paragraph_dict()
# 索引的实体列表
entity_list_data = openie_data.extract_entity_dict()
# 索引的三元组列表
triple_list_data = openie_data.extract_triple_dict()
# print(openie_data.docs)
if len(raw_paragraphs) != len(entity_list_data) or len(raw_paragraphs) != len(triple_list_data):
logger.error("OpenIE数据存在异常")
logger.error(f"原始段落数量:{len(raw_paragraphs)}")
logger.error(f"实体列表数量:{len(entity_list_data)}")
logger.error(f"三元组列表数量:{len(triple_list_data)}")
logger.error("OpenIE数据段落数量与实体列表数量或三元组列表数量不一致")
logger.error("请保证你的原始数据分段良好,不要有类似于 “.....” 单独成一段的情况")
logger.error("或者一段中只有符号的情况")
# 新增检查docs中每条数据的完整性
logger.error("系统将于2秒后开始检查数据完整性")
sleep(2)
found_missing = False
missing_idxs = []
for doc in getattr(openie_data, "docs", []):
idx = doc.get("idx", "<无idx>")
passage = doc.get("passage", "<无passage>")
missing = []
# 检查字段是否存在且非空
if "passage" not in doc or not doc.get("passage"):
missing.append("passage")
if "extracted_entities" not in doc or not isinstance(doc.get("extracted_entities"), list):
missing.append("名词列表缺失")
elif len(doc.get("extracted_entities", [])) == 0:
missing.append("名词列表为空")
if "extracted_triples" not in doc or not isinstance(doc.get("extracted_triples"), list):
missing.append("主谓宾三元组缺失")
elif len(doc.get("extracted_triples", [])) == 0:
missing.append("主谓宾三元组为空")
# 输出所有doc的idx
# print(f"检查: idx={idx}")
if missing:
found_missing = True
missing_idxs.append(idx)
logger.error("\n")
logger.error("数据缺失:")
logger.error(f"对应哈希值:{idx}")
logger.error(f"对应文段内容内容:{passage}")
logger.error(f"非法原因:{', '.join(missing)}")
# 确保提示在所有非法数据输出后再输出
if not found_missing:
logger.info("所有数据均完整,没有发现缺失字段。")
return False
# 新增:提示用户是否删除非法文段继续导入
# 将print移到所有logger.error之后确保不会被冲掉
logger.info(f"\n检测到非法文段,共{len(missing_idxs)}条。")
logger.info("\n是否删除所有非法文段后继续导入?(y/n): ", end="")
user_choice = input().strip().lower()
if user_choice != "y":
logger.info("用户选择不删除非法文段,程序终止。")
sys.exit(1)
# 删除非法文段
logger.info("正在删除非法文段并继续导入...")
# 过滤掉非法文段
openie_data.docs = [
doc for doc in getattr(openie_data, "docs", []) if doc.get("idx", "<无idx>") not in missing_idxs
]
# 重新提取数据
raw_paragraphs = openie_data.extract_raw_paragraph_dict()
entity_list_data = openie_data.extract_entity_dict()
triple_list_data = openie_data.extract_triple_dict()
# 再次校验
if len(raw_paragraphs) != len(entity_list_data) or len(raw_paragraphs) != len(triple_list_data):
logger.error("删除非法文段后,数据仍不一致,程序终止。")
sys.exit(1)
# 将索引换为对应段落的hash值
logger.info("正在进行段落去重与重索引")
raw_paragraphs, triple_list_data = hash_deduplicate(
raw_paragraphs,
triple_list_data,
embed_manager.stored_pg_hashes,
kg_manager.stored_paragraph_hashes,
)
if len(raw_paragraphs) != 0:
# 获取嵌入并保存
logger.info(f"段落去重完成,剩余待处理的段落数量:{len(raw_paragraphs)}")
logger.info("开始Embedding")
embed_manager.store_new_data_set(raw_paragraphs, triple_list_data)
# Embedding-Faiss重索引
logger.info("正在重新构建向量索引")
embed_manager.rebuild_faiss_index()
logger.info("向量索引构建完成")
embed_manager.save_to_file()
logger.info("Embedding完成")
# 构建新段落的RAG
logger.info("开始构建RAG")
kg_manager.build_kg(triple_list_data, embed_manager)
kg_manager.save_to_file()
logger.info("RAG构建完成")
else:
logger.info("无新段落需要处理")
return True
async def main_async(): # sourcery skip: dict-comprehension
# 新增确认提示
print("=== 重要操作确认 ===")
print("OpenIE导入时会大量发送请求可能会撞到请求速度上限请注意选用的模型")
print("同之前样例在本地模型下在70分钟内我们发送了约8万条请求在网络允许下速度会更快")
print("推荐使用硅基流动的Pro/BAAI/bge-m3")
print("每百万Token费用为0.7元")
print("知识导入时,会消耗大量系统资源,建议在较好配置电脑上运行")
print("同上样例导入时10700K几乎跑满14900HX占用80%峰值内存占用约3G")
confirm = input("确认继续执行?(y/n): ").strip().lower()
if confirm != "y":
logger.info("用户取消操作")
print("操作已取消")
sys.exit(1)
print("\n" + "=" * 40 + "\n")
ensure_openie_dir() # 确保OpenIE目录存在
logger.info("----开始导入openie数据----\n")
logger.info("创建LLM客户端")
# 初始化Embedding库
embed_manager = EmbeddingManager()
logger.info("正在从文件加载Embedding库")
try:
embed_manager.load_from_file()
except Exception as e:
logger.error(f"从文件加载Embedding库时发生错误{e}")
if "嵌入模型与本地存储不一致" in str(e):
logger.error("检测到嵌入模型与本地存储不一致,已终止导入。请检查模型设置或清空嵌入库后重试。")
logger.error("请保证你的嵌入模型从未更改,并且在导入时使用相同的模型")
# print("检测到嵌入模型与本地存储不一致,已终止导入。请检查模型设置或清空嵌入库后重试。")
sys.exit(1)
if "不存在" in str(e):
logger.error("如果你是第一次导入知识,请忽略此错误")
logger.info("Embedding库加载完成")
# 初始化KG
kg_manager = KGManager()
logger.info("正在从文件加载KG")
try:
kg_manager.load_from_file()
except Exception as e:
logger.error(f"从文件加载KG时发生错误{e}")
logger.error("如果你是第一次导入知识,请忽略此错误")
logger.info("KG加载完成")
logger.info(f"KG节点数量{len(kg_manager.graph.get_node_list())}")
logger.info(f"KG边数量{len(kg_manager.graph.get_edge_list())}")
# 数据比对Embedding库与KG的段落hash集合
for pg_hash in kg_manager.stored_paragraph_hashes:
# 使用与EmbeddingStore中一致的命名空间格式namespace-hash
key = f"paragraph-{pg_hash}"
if key not in embed_manager.stored_pg_hashes:
logger.warning(f"KG中存在Embedding库中不存在的段落{key}")
logger.info("正在导入OpenIE数据文件")
try:
openie_data = OpenIE.load()
except Exception as e:
logger.error(f"导入OpenIE数据文件时发生错误{e}")
return False
if handle_import_openie(openie_data, embed_manager, kg_manager) is False:
logger.error("处理OpenIE数据时发生错误")
return False
return None
def main():
"""主函数 - 设置新的事件循环并运行异步主函数"""
# 检查是否有现有的事件循环
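    # 注:在同步上下文中 get_running_loop() 总会抛出 RuntimeError,
    # 正常流程会走下方 except 分支新建事件循环;try 分支仅作防御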
try:
loop = asyncio.get_running_loop()
if loop.is_closed():
# 如果事件循环已关闭,创建新的
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
except RuntimeError:
# 没有运行的事件循环,创建新的
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# 在新的事件循环中运行异步主函数
loop.run_until_complete(main_async())
finally:
# 确保事件循环被正确关闭
if not loop.is_closed():
loop.close()
if __name__ == "__main__":
# logger.info(f"111111111111111111111111{ROOT_PATH}")
main()

scripts/info_extraction.py (new file)

@@ -0,0 +1,217 @@
import json
import os
import signal
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Event
import sys
import datetime
# 添加项目根目录到 sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from rich.progress import Progress # 替换为 rich 进度条
from src.common.logger import get_logger
# from src.chat.knowledge.lpmmconfig import global_config
from src.chat.knowledge.ie_process import info_extract_from_str
from src.chat.knowledge.open_ie import OpenIE
from rich.progress import (
BarColumn,
TimeElapsedColumn,
TimeRemainingColumn,
TaskProgressColumn,
MofNCompleteColumn,
SpinnerColumn,
TextColumn,
)
from raw_data_preprocessor import RAW_DATA_PATH, load_raw_data
from src.config.config import global_config, model_config
from src.llm_models.utils_model import LLMRequest
logger = get_logger("LPMM知识库-信息提取")
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
TEMP_DIR = os.path.join(ROOT_PATH, "temp")
# IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data", "imported_lpmm_data")
OPENIE_OUTPUT_DIR = os.path.join(ROOT_PATH, "data", "openie")
def ensure_dirs():
"""确保临时目录和输出目录存在"""
if not os.path.exists(TEMP_DIR):
os.makedirs(TEMP_DIR)
logger.info(f"已创建临时目录: {TEMP_DIR}")
if not os.path.exists(OPENIE_OUTPUT_DIR):
os.makedirs(OPENIE_OUTPUT_DIR)
logger.info(f"已创建输出目录: {OPENIE_OUTPUT_DIR}")
if not os.path.exists(RAW_DATA_PATH):
os.makedirs(RAW_DATA_PATH)
logger.info(f"已创建原始数据目录: {RAW_DATA_PATH}")
# 创建一个线程安全的锁,用于保护文件操作和共享数据
file_lock = Lock()
open_ie_doc_lock = Lock()
# 创建一个事件标志,用于控制程序终止
shutdown_event = Event()
lpmm_entity_extract_llm = LLMRequest(
model_set=model_config.model_task_config.lpmm_entity_extract,
request_type="lpmm.entity_extract"
)
lpmm_rdf_build_llm = LLMRequest(
model_set=model_config.model_task_config.lpmm_rdf_build,
request_type="lpmm.rdf_build"
)
def process_single_text(pg_hash, raw_data):
"""处理单个文本的函数,用于线程池"""
temp_file_path = f"{TEMP_DIR}/{pg_hash}.json"
# 使用文件锁检查和读取缓存文件
with file_lock:
if os.path.exists(temp_file_path):
try:
# 存在对应的提取结果
logger.info(f"找到缓存的提取结果:{pg_hash}")
with open(temp_file_path, "r", encoding="utf-8") as f:
return json.load(f), None
except json.JSONDecodeError:
# 如果JSON文件损坏删除它并重新处理
logger.warning(f"缓存文件损坏,重新处理:{pg_hash}")
os.remove(temp_file_path)
entity_list, rdf_triple_list = info_extract_from_str(
lpmm_entity_extract_llm,
lpmm_rdf_build_llm,
raw_data,
)
if entity_list is None or rdf_triple_list is None:
return None, pg_hash
doc_item = {
"idx": pg_hash,
"passage": raw_data,
"extracted_entities": entity_list,
"extracted_triples": rdf_triple_list,
}
# 保存临时提取结果
with file_lock:
try:
with open(temp_file_path, "w", encoding="utf-8") as f:
json.dump(doc_item, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"保存缓存文件失败:{pg_hash}, 错误:{e}")
            # 如果保存失败,确保不会留下损坏的文件,并按失败处理该段落
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
            return None, pg_hash
return doc_item, None
def signal_handler(_signum, _frame):
"""处理Ctrl+C信号"""
logger.info("\n接收到中断信号,正在优雅地关闭程序...")
sys.exit(0)
def main(): # sourcery skip: comprehension-to-generator, extract-method
# 设置信号处理器
signal.signal(signal.SIGINT, signal_handler)
ensure_dirs() # 确保目录存在
# 新增用户确认提示
print("=== 重要操作确认,请认真阅读以下内容哦 ===")
print("实体提取操作将会花费较多api余额和时间建议在空闲时段执行。")
print("举例600万字全剧情提取选用deepseek v3 0324消耗约40元约3小时。")
print("建议使用硅基流动的非Pro模型")
print("或者使用可以用赠金抵扣的Pro模型")
print("请确保账户余额充足,并且在执行前确认无误。")
confirm = input("确认继续执行?(y/n): ").strip().lower()
if confirm != "y":
logger.info("用户取消操作")
print("操作已取消")
sys.exit(1)
print("\n" + "=" * 40 + "\n")
logger.info("--------进行信息提取--------\n")
# 加载原始数据
logger.info("正在加载原始数据")
all_sha256_list, all_raw_datas = load_raw_data()
failed_sha256 = []
open_ie_doc = []
workers = global_config.lpmm_knowledge.info_extraction_workers
with ThreadPoolExecutor(max_workers=workers) as executor:
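        # 将每个提交的 future 映射回其段落 hash,便于失败时定位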
future_to_hash = {
executor.submit(process_single_text, pg_hash, raw_data): pg_hash
for pg_hash, raw_data in zip(all_sha256_list, all_raw_datas, strict=False)
}
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
MofNCompleteColumn(),
"",
TimeElapsedColumn(),
"<",
TimeRemainingColumn(),
transient=False,
) as progress:
task = progress.add_task("正在进行提取:", total=len(future_to_hash))
try:
for future in as_completed(future_to_hash):
if shutdown_event.is_set():
for f in future_to_hash:
if not f.done():
f.cancel()
break
doc_item, failed_hash = future.result()
if failed_hash:
failed_sha256.append(failed_hash)
logger.error(f"提取失败:{failed_hash}")
elif doc_item:
with open_ie_doc_lock:
open_ie_doc.append(doc_item)
progress.update(task, advance=1)
except KeyboardInterrupt:
logger.info("\n接收到中断信号,正在优雅地关闭程序...")
shutdown_event.set()
for f in future_to_hash:
if not f.done():
f.cancel()
# 合并所有文件的提取结果并保存
if open_ie_doc:
sum_phrase_chars = sum([len(e) for chunk in open_ie_doc for e in chunk["extracted_entities"]])
sum_phrase_words = sum([len(e.split()) for chunk in open_ie_doc for e in chunk["extracted_entities"]])
num_phrases = sum([len(chunk["extracted_entities"]) for chunk in open_ie_doc])
openie_obj = OpenIE(
open_ie_doc,
round(sum_phrase_chars / num_phrases, 4) if num_phrases else 0,
round(sum_phrase_words / num_phrases, 4) if num_phrases else 0,
)
# 输出文件名格式MM-DD-HH-ss-openie.json
now = datetime.datetime.now()
filename = now.strftime("%m-%d-%H-%S-openie.json")
output_path = os.path.join(OPENIE_OUTPUT_DIR, filename)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(
openie_obj.to_dict() if hasattr(openie_obj, "to_dict") else openie_obj.__dict__,
f,
ensure_ascii=False,
indent=4,
)
logger.info(f"信息提取结果已保存到: {output_path}")
else:
logger.warning("没有可保存的信息提取结果")
logger.info("--------信息提取完成--------")
logger.info(f"提取失败的文段SHA256{failed_sha256}")
if __name__ == "__main__":
main()


@@ -0,0 +1,287 @@
import time
import sys
import os
from typing import Dict, List, Tuple, Optional
from datetime import datetime
# Add project root to Python path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, project_root)
from src.common.database.database_model import Messages, ChatStreams  # noqa: E402
def get_chat_name(chat_id: str) -> str:
"""Get chat name from chat_id by querying ChatStreams table directly"""
try:
chat_stream = ChatStreams.get_or_none(ChatStreams.stream_id == chat_id)
if chat_stream is None:
return f"未知聊天 ({chat_id})"
if chat_stream.group_name:
return f"{chat_stream.group_name} ({chat_id})"
elif chat_stream.user_nickname:
return f"{chat_stream.user_nickname}的私聊 ({chat_id})"
else:
return f"未知聊天 ({chat_id})"
except Exception:
return f"查询失败 ({chat_id})"
def format_timestamp(timestamp: float) -> str:
"""Format timestamp to readable date string"""
try:
return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")
except (ValueError, OSError):
return "未知时间"
def calculate_interest_value_distribution(messages) -> Dict[str, int]:
"""Calculate distribution of interest_value"""
distribution = {
'0.000-0.010': 0,
'0.010-0.050': 0,
'0.050-0.100': 0,
'0.100-0.500': 0,
'0.500-1.000': 0,
'1.000-2.000': 0,
'2.000-5.000': 0,
'5.000-10.000': 0,
'10.000+': 0
}
for msg in messages:
if msg.interest_value is None or msg.interest_value == 0.0:
continue
value = float(msg.interest_value)
if value < 0.010:
distribution['0.000-0.010'] += 1
elif value < 0.050:
distribution['0.010-0.050'] += 1
elif value < 0.100:
distribution['0.050-0.100'] += 1
elif value < 0.500:
distribution['0.100-0.500'] += 1
elif value < 1.000:
distribution['0.500-1.000'] += 1
elif value < 2.000:
distribution['1.000-2.000'] += 1
elif value < 5.000:
distribution['2.000-5.000'] += 1
elif value < 10.000:
distribution['5.000-10.000'] += 1
else:
distribution['10.000+'] += 1
return distribution
def get_interest_value_stats(messages) -> Dict[str, float]:
"""Calculate basic statistics for interest_value"""
values = [float(msg.interest_value) for msg in messages if msg.interest_value is not None and msg.interest_value != 0.0]
if not values:
return {
'count': 0,
'min': 0,
'max': 0,
'avg': 0,
'median': 0
}
values.sort()
count = len(values)
return {
'count': count,
'min': min(values),
'max': max(values),
'avg': sum(values) / count,
'median': values[count // 2] if count % 2 == 1 else (values[count // 2 - 1] + values[count // 2]) / 2
}
def get_available_chats() -> List[Tuple[str, str, int]]:
"""Get all available chats with message counts"""
try:
# 获取所有有消息的chat_id
chat_counts = {}
for msg in Messages.select(Messages.chat_id).distinct():
chat_id = msg.chat_id
count = Messages.select().where(
(Messages.chat_id == chat_id) &
(Messages.interest_value.is_null(False)) &
(Messages.interest_value != 0.0)
).count()
if count > 0:
chat_counts[chat_id] = count
# 获取聊天名称
result = []
for chat_id, count in chat_counts.items():
chat_name = get_chat_name(chat_id)
result.append((chat_id, chat_name, count))
# 按消息数量排序
result.sort(key=lambda x: x[2], reverse=True)
return result
except Exception as e:
print(f"获取聊天列表失败: {e}")
return []
def get_time_range_input() -> Tuple[Optional[float], Optional[float]]:
"""Get time range input from user"""
print("\n时间范围选择:")
print("1. 最近1天")
print("2. 最近3天")
print("3. 最近7天")
print("4. 最近30天")
print("5. 自定义时间范围")
print("6. 不限制时间")
choice = input("请选择时间范围 (1-6): ").strip()
now = time.time()
if choice == "1":
return now - 24*3600, now
elif choice == "2":
return now - 3*24*3600, now
elif choice == "3":
return now - 7*24*3600, now
elif choice == "4":
return now - 30*24*3600, now
elif choice == "5":
print("请输入开始时间 (格式: YYYY-MM-DD HH:MM:SS):")
start_str = input().strip()
print("请输入结束时间 (格式: YYYY-MM-DD HH:MM:SS):")
end_str = input().strip()
try:
start_time = datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S").timestamp()
end_time = datetime.strptime(end_str, "%Y-%m-%d %H:%M:%S").timestamp()
return start_time, end_time
except ValueError:
print("时间格式错误,将不限制时间范围")
return None, None
else:
return None, None
def analyze_interest_values(chat_id: Optional[str] = None, start_time: Optional[float] = None, end_time: Optional[float] = None) -> None:
"""Analyze interest values with optional filters"""
# 构建查询条件
query = Messages.select().where(
(Messages.interest_value.is_null(False)) &
(Messages.interest_value != 0.0)
)
if chat_id:
query = query.where(Messages.chat_id == chat_id)
if start_time:
query = query.where(Messages.time >= start_time)
if end_time:
query = query.where(Messages.time <= end_time)
messages = list(query)
if not messages:
print("没有找到符合条件的消息")
return
# 计算统计信息
distribution = calculate_interest_value_distribution(messages)
stats = get_interest_value_stats(messages)
# 显示结果
print("\n=== Interest Value 分析结果 ===")
if chat_id:
print(f"聊天: {get_chat_name(chat_id)}")
else:
print("聊天: 全部聊天")
if start_time and end_time:
print(f"时间范围: {format_timestamp(start_time)}{format_timestamp(end_time)}")
elif start_time:
print(f"时间范围: {format_timestamp(start_time)} 之后")
elif end_time:
print(f"时间范围: {format_timestamp(end_time)} 之前")
else:
print("时间范围: 不限制")
print("\n基本统计:")
print(f"有效消息数量: {stats['count']} (排除null和0值)")
print(f"最小值: {stats['min']:.3f}")
print(f"最大值: {stats['max']:.3f}")
print(f"平均值: {stats['avg']:.3f}")
print(f"中位数: {stats['median']:.3f}")
print("\nInterest Value 分布:")
total = stats['count']
for range_name, count in distribution.items():
if count > 0:
percentage = count / total * 100
print(f"{range_name}: {count} ({percentage:.2f}%)")
def interactive_menu() -> None:
"""Interactive menu for interest value analysis"""
while True:
print("\n" + "="*50)
print("Interest Value 分析工具")
print("="*50)
print("1. 分析全部聊天")
print("2. 选择特定聊天分析")
print("q. 退出")
choice = input("\n请选择分析模式 (1-2, q): ").strip()
if choice.lower() == 'q':
print("再见!")
break
chat_id = None
if choice == "2":
# 显示可用的聊天列表
chats = get_available_chats()
if not chats:
print("没有找到有interest_value数据的聊天")
continue
print(f"\n可用的聊天 (共{len(chats)}个):")
for i, (_cid, name, count) in enumerate(chats, 1):
print(f"{i}. {name} ({count}条有效消息)")
try:
chat_choice = int(input(f"\n请选择聊天 (1-{len(chats)}): ").strip())
if 1 <= chat_choice <= len(chats):
chat_id = chats[chat_choice - 1][0]
else:
print("无效选择")
continue
except ValueError:
print("请输入有效数字")
continue
elif choice != "1":
print("无效选择")
continue
# 获取时间范围
start_time, end_time = get_time_range_input()
# 执行分析
analyze_interest_values(chat_id, start_time, end_time)
input("\n按回车键继续...")
if __name__ == "__main__":
interactive_menu()

File diff suppressed because it is too large

scripts/manifest_tool.py (new file)

@@ -0,0 +1,237 @@
"""
插件Manifest管理命令行工具
提供插件manifest文件的创建、验证和管理功能
"""
import os
import sys
import argparse
import json
from pathlib import Path
# 添加项目根目录到Python路径(必须在导入 src 模块之前)
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from src.common.logger import get_logger
from src.plugin_system.utils.manifest_utils import (
    ManifestValidator,
)
logger = get_logger("manifest_tool")
def create_minimal_manifest(plugin_dir: str, plugin_name: str, description: str = "", author: str = "") -> bool:
"""创建最小化的manifest文件
Args:
plugin_dir: 插件目录
plugin_name: 插件名称
description: 插件描述
author: 插件作者
Returns:
bool: 是否创建成功
"""
manifest_path = os.path.join(plugin_dir, "_manifest.json")
if os.path.exists(manifest_path):
print(f"❌ Manifest文件已存在: {manifest_path}")
return False
# 创建最小化manifest
minimal_manifest = {
"manifest_version": 1,
"name": plugin_name,
"version": "1.0.0",
"description": description or f"{plugin_name}插件",
"author": {"name": author or "Unknown"},
}
try:
with open(manifest_path, "w", encoding="utf-8") as f:
json.dump(minimal_manifest, f, ensure_ascii=False, indent=2)
print(f"✅ 已创建最小化manifest文件: {manifest_path}")
return True
except Exception as e:
print(f"❌ 创建manifest文件失败: {e}")
return False
def create_complete_manifest(plugin_dir: str, plugin_name: str) -> bool:
"""创建完整的manifest模板文件
Args:
plugin_dir: 插件目录
plugin_name: 插件名称
Returns:
bool: 是否创建成功
"""
manifest_path = os.path.join(plugin_dir, "_manifest.json")
if os.path.exists(manifest_path):
print(f"❌ Manifest文件已存在: {manifest_path}")
return False
# 创建完整模板
complete_manifest = {
"manifest_version": 1,
"name": plugin_name,
"version": "1.0.0",
"description": f"{plugin_name}插件描述",
"author": {"name": "插件作者", "url": "https://github.com/your-username"},
"license": "MIT",
"host_application": {"min_version": "1.0.0", "max_version": "4.0.0"},
"homepage_url": "https://github.com/your-repo",
"repository_url": "https://github.com/your-repo",
"keywords": ["keyword1", "keyword2"],
"categories": ["Category1"],
"default_locale": "zh-CN",
"locales_path": "_locales",
"plugin_info": {
"is_built_in": False,
"plugin_type": "general",
"components": [{"type": "action", "name": "sample_action", "description": "示例动作组件"}],
},
}
try:
with open(manifest_path, "w", encoding="utf-8") as f:
json.dump(complete_manifest, f, ensure_ascii=False, indent=2)
print(f"✅ 已创建完整manifest模板: {manifest_path}")
print("💡 请根据实际情况修改manifest文件中的内容")
return True
except Exception as e:
print(f"❌ 创建manifest文件失败: {e}")
return False
def validate_manifest_file(plugin_dir: str) -> bool:
"""验证manifest文件
Args:
plugin_dir: 插件目录
Returns:
bool: 是否验证通过
"""
manifest_path = os.path.join(plugin_dir, "_manifest.json")
if not os.path.exists(manifest_path):
print(f"❌ 未找到manifest文件: {manifest_path}")
return False
try:
with open(manifest_path, "r", encoding="utf-8") as f:
manifest_data = json.load(f)
validator = ManifestValidator()
is_valid = validator.validate_manifest(manifest_data)
# 显示验证结果
print("📋 Manifest验证结果:")
print(validator.get_validation_report())
if is_valid:
print("✅ Manifest文件验证通过")
else:
print("❌ Manifest文件验证失败")
return is_valid
except json.JSONDecodeError as e:
print(f"❌ Manifest文件格式错误: {e}")
return False
except Exception as e:
print(f"❌ 验证过程中发生错误: {e}")
return False
def scan_plugins_without_manifest(root_dir: str) -> None:
"""扫描缺少manifest文件的插件
Args:
root_dir: 扫描的根目录
"""
print(f"🔍 扫描目录: {root_dir}")
plugins_without_manifest = []
for root, dirs, files in os.walk(root_dir):
# 跳过隐藏目录和__pycache__
dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__"]
# 检查是否包含plugin.py文件标识为插件目录
if "plugin.py" in files:
manifest_path = os.path.join(root, "_manifest.json")
if not os.path.exists(manifest_path):
plugins_without_manifest.append(root)
if plugins_without_manifest:
print(f"❌ 发现 {len(plugins_without_manifest)} 个插件缺少manifest文件:")
for plugin_dir in plugins_without_manifest:
plugin_name = os.path.basename(plugin_dir)
print(f" - {plugin_name}: {plugin_dir}")
print("💡 使用 'python manifest_tool.py create-minimal <插件目录>' 创建manifest文件")
else:
print("✅ 所有插件都有manifest文件")
def main():
"""主函数"""
parser = argparse.ArgumentParser(description="插件Manifest管理工具")
subparsers = parser.add_subparsers(dest="command", help="可用命令")
# 创建最小化manifest命令
create_minimal_parser = subparsers.add_parser("create-minimal", help="创建最小化manifest文件")
create_minimal_parser.add_argument("plugin_dir", help="插件目录路径")
create_minimal_parser.add_argument("--name", help="插件名称")
create_minimal_parser.add_argument("--description", help="插件描述")
create_minimal_parser.add_argument("--author", help="插件作者")
# 创建完整manifest命令
create_complete_parser = subparsers.add_parser("create-complete", help="创建完整manifest模板")
create_complete_parser.add_argument("plugin_dir", help="插件目录路径")
create_complete_parser.add_argument("--name", help="插件名称")
# 验证manifest命令
validate_parser = subparsers.add_parser("validate", help="验证manifest文件")
validate_parser.add_argument("plugin_dir", help="插件目录路径")
# 扫描插件命令
scan_parser = subparsers.add_parser("scan", help="扫描缺少manifest的插件")
scan_parser.add_argument("root_dir", help="扫描的根目录路径")
args = parser.parse_args()
if not args.command:
parser.print_help()
return
try:
if args.command == "create-minimal":
plugin_name = args.name or os.path.basename(os.path.abspath(args.plugin_dir))
success = create_minimal_manifest(args.plugin_dir, plugin_name, args.description or "", args.author or "")
sys.exit(0 if success else 1)
elif args.command == "create-complete":
plugin_name = args.name or os.path.basename(os.path.abspath(args.plugin_dir))
success = create_complete_manifest(args.plugin_dir, plugin_name)
sys.exit(0 if success else 1)
elif args.command == "validate":
success = validate_manifest_file(args.plugin_dir)
sys.exit(0 if success else 1)
elif args.command == "scan":
scan_plugins_without_manifest(args.root_dir)
except Exception as e:
print(f"❌ 执行命令时发生错误: {e}")
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,920 @@
import os
import json
import sys # 新增系统模块导入
# import time
import pickle
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from typing import Dict, Any, List, Optional, Type
from dataclasses import dataclass, field
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from peewee import Model, Field, IntegrityError
# Rich 进度条和显示组件
from rich.console import Console
from rich.progress import (
Progress,
TextColumn,
BarColumn,
TaskProgressColumn,
TimeRemainingColumn,
TimeElapsedColumn,
SpinnerColumn,
)
from rich.table import Table
from rich.panel import Panel
# from rich.text import Text
from src.common.database.database import db
from src.common.database.database_model import (
ChatStreams,
Emoji,
Messages,
Images,
ImageDescriptions,
PersonInfo,
Knowledges,
ThinkingLog,
GraphNodes,
GraphEdges,
)
from src.common.logger import get_logger
logger = get_logger("mongodb_to_sqlite")
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@dataclass
class MigrationConfig:
"""迁移配置类"""
mongo_collection: str
target_model: Type[Model]
field_mapping: Dict[str, str]
batch_size: int = 500
enable_validation: bool = True
skip_duplicates: bool = True
unique_fields: List[str] = field(default_factory=list) # 用于重复检查的字段
# 数据验证相关类已移除 - 用户要求不要数据验证
@dataclass
class MigrationCheckpoint:
"""迁移断点数据"""
collection_name: str
processed_count: int
last_processed_id: Any
timestamp: datetime
batch_errors: List[Dict[str, Any]] = field(default_factory=list)
@dataclass
class MigrationStats:
"""迁移统计信息"""
total_documents: int = 0
processed_count: int = 0
success_count: int = 0
error_count: int = 0
skipped_count: int = 0
duplicate_count: int = 0
validation_errors: int = 0
batch_insert_count: int = 0
errors: List[Dict[str, Any]] = field(default_factory=list)
start_time: Optional[datetime] = None
end_time: Optional[datetime] = None
def add_error(self, doc_id: Any, error: str, doc_data: Optional[Dict] = None):
"""添加错误记录"""
self.errors.append(
{"doc_id": str(doc_id), "error": error, "timestamp": datetime.now().isoformat(), "doc_data": doc_data}
)
self.error_count += 1
def add_validation_error(self, doc_id: Any, field: str, error: str):
"""添加验证错误"""
self.add_error(doc_id, f"验证失败 - {field}: {error}")
self.validation_errors += 1
class MongoToSQLiteMigrator:
"""MongoDB到SQLite数据迁移器 - 使用Peewee ORM"""
def __init__(self, mongo_uri: Optional[str] = None, database_name: Optional[str] = None):
self.database_name = database_name or os.getenv("DATABASE_NAME", "MegBot")
self.mongo_uri = mongo_uri or self._build_mongo_uri()
self.mongo_client: Optional[MongoClient] = None
self.mongo_db = None
# 迁移配置
self.migration_configs = self._initialize_migration_configs()
# 进度条控制台
self.console = Console()
# 检查点目录
self.checkpoint_dir = Path(os.path.join(ROOT_PATH, "data", "checkpoints"))
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
# 验证规则已禁用
self.validation_rules = self._initialize_validation_rules()
def _build_mongo_uri(self) -> str:
"""构建MongoDB连接URI"""
if mongo_uri := os.getenv("MONGODB_URI"):
return mongo_uri
user = os.getenv("MONGODB_USER")
password = os.getenv("MONGODB_PASS")
host = os.getenv("MONGODB_HOST", "localhost")
port = os.getenv("MONGODB_PORT", "27017")
auth_source = os.getenv("MONGODB_AUTH_SOURCE", "admin")
if user and password:
return f"mongodb://{user}:{password}@{host}:{port}/{self.database_name}?authSource={auth_source}"
else:
return f"mongodb://{host}:{port}/{self.database_name}"
def _initialize_migration_configs(self) -> List[MigrationConfig]:
"""初始化迁移配置"""
return [ # 表情包迁移配置
MigrationConfig(
mongo_collection="emoji",
target_model=Emoji,
field_mapping={
"full_path": "full_path",
"format": "format",
"hash": "emoji_hash",
"description": "description",
"emotion": "emotion",
"usage_count": "usage_count",
"last_used_time": "last_used_time",
# record_time字段将在转换时自动设置为当前时间
},
enable_validation=False, # 禁用数据验证
unique_fields=["full_path", "emoji_hash"],
),
# 聊天流迁移配置
MigrationConfig(
mongo_collection="chat_streams",
target_model=ChatStreams,
field_mapping={
"stream_id": "stream_id",
"create_time": "create_time",
"group_info.platform": "group_platform", # 由于Mongodb处理私聊时会让group_info值为null而新的数据库不允许为null所以私聊聊天流是没法迁移的等更新吧。
"group_info.group_id": "group_id", # 同上
"group_info.group_name": "group_name", # 同上
"last_active_time": "last_active_time",
"platform": "platform",
"user_info.platform": "user_platform",
"user_info.user_id": "user_id",
"user_info.user_nickname": "user_nickname",
"user_info.user_cardname": "user_cardname",
},
enable_validation=False, # 禁用数据验证
unique_fields=["stream_id"],
),
# 消息迁移配置
MigrationConfig(
mongo_collection="messages",
target_model=Messages,
field_mapping={
"message_id": "message_id",
"time": "time",
"chat_id": "chat_id",
"chat_info.stream_id": "chat_info_stream_id",
"chat_info.platform": "chat_info_platform",
"chat_info.user_info.platform": "chat_info_user_platform",
"chat_info.user_info.user_id": "chat_info_user_id",
"chat_info.user_info.user_nickname": "chat_info_user_nickname",
"chat_info.user_info.user_cardname": "chat_info_user_cardname",
"chat_info.group_info.platform": "chat_info_group_platform",
"chat_info.group_info.group_id": "chat_info_group_id",
"chat_info.group_info.group_name": "chat_info_group_name",
"chat_info.create_time": "chat_info_create_time",
"chat_info.last_active_time": "chat_info_last_active_time",
"user_info.platform": "user_platform",
"user_info.user_id": "user_id",
"user_info.user_nickname": "user_nickname",
"user_info.user_cardname": "user_cardname",
"processed_plain_text": "processed_plain_text",
"memorized_times": "memorized_times",
},
enable_validation=False, # 禁用数据验证
unique_fields=["message_id"],
),
# 图片迁移配置
MigrationConfig(
mongo_collection="images",
target_model=Images,
field_mapping={
"hash": "emoji_hash",
"description": "description",
"path": "path",
"timestamp": "timestamp",
"type": "type",
},
unique_fields=["path"],
),
# 图片描述迁移配置
MigrationConfig(
mongo_collection="image_descriptions",
target_model=ImageDescriptions,
field_mapping={
"type": "type",
"hash": "image_description_hash",
"description": "description",
"timestamp": "timestamp",
},
unique_fields=["image_description_hash", "type"],
),
# 个人信息迁移配置
MigrationConfig(
mongo_collection="person_info",
target_model=PersonInfo,
field_mapping={
"person_id": "person_id",
"person_name": "person_name",
"name_reason": "name_reason",
"platform": "platform",
"user_id": "user_id",
"nickname": "nickname",
"relationship_value": "relationship_value",
"konw_time": "know_time",
},
unique_fields=["person_id"],
),
# 知识库迁移配置
MigrationConfig(
mongo_collection="knowledges",
target_model=Knowledges,
field_mapping={"content": "content", "embedding": "embedding"},
unique_fields=["content"], # 假设内容唯一
),
# 思考日志迁移配置
MigrationConfig(
mongo_collection="thinking_log",
target_model=ThinkingLog,
field_mapping={
"chat_id": "chat_id",
"trigger_text": "trigger_text",
"response_text": "response_text",
"trigger_info": "trigger_info_json",
"response_info": "response_info_json",
"timing_results": "timing_results_json",
"chat_history": "chat_history_json",
"chat_history_in_thinking": "chat_history_in_thinking_json",
"chat_history_after_response": "chat_history_after_response_json",
"heartflow_data": "heartflow_data_json",
"reasoning_data": "reasoning_data_json",
},
unique_fields=["chat_id", "trigger_text"],
),
# 图节点迁移配置
MigrationConfig(
mongo_collection="graph_data.nodes",
target_model=GraphNodes,
field_mapping={
"concept": "concept",
"memory_items": "memory_items",
"hash": "hash",
"created_time": "created_time",
"last_modified": "last_modified",
},
unique_fields=["concept"],
),
# 图边迁移配置
MigrationConfig(
mongo_collection="graph_data.edges",
target_model=GraphEdges,
field_mapping={
"source": "source",
"target": "target",
"strength": "strength",
"hash": "hash",
"created_time": "created_time",
"last_modified": "last_modified",
},
unique_fields=["source", "target"], # 组合唯一性
),
]
def _initialize_validation_rules(self) -> Dict[str, Any]:
"""数据验证已禁用 - 返回空字典"""
return {}
def connect_mongodb(self) -> bool:
"""连接到MongoDB"""
try:
self.mongo_client = MongoClient(
self.mongo_uri, serverSelectionTimeoutMS=5000, connectTimeoutMS=10000, maxPoolSize=10
)
# 测试连接
self.mongo_client.admin.command("ping")
self.mongo_db = self.mongo_client[self.database_name]
logger.info(f"成功连接到MongoDB: {self.database_name}")
return True
except ConnectionFailure as e:
logger.error(f"MongoDB连接失败: {e}")
return False
except Exception as e:
logger.error(f"MongoDB连接异常: {e}")
return False
def disconnect_mongodb(self):
"""断开MongoDB连接"""
if self.mongo_client:
self.mongo_client.close()
logger.info("MongoDB连接已关闭")
def _get_nested_value(self, document: Dict[str, Any], field_path: str) -> Any:
"""获取嵌套字段的值"""
if "." not in field_path:
return document.get(field_path)
parts = field_path.split(".")
value = document
for part in parts:
if isinstance(value, dict):
value = value.get(part)
else:
return None
if value is None:
break
return value
def _convert_field_value(self, value: Any, target_field: Field) -> Any:
"""根据目标字段类型转换值"""
if value is None:
return None
field_type = target_field.__class__.__name__
try:
if target_field.name == "record_time" and field_type == "DateTimeField":
return datetime.now()
if field_type in ["CharField", "TextField"]:
if isinstance(value, (list, dict)):
return json.dumps(value, ensure_ascii=False)
return str(value) if value is not None else ""
elif field_type == "IntegerField":
if isinstance(value, str):
# 处理字符串数字
clean_value = value.strip()
if clean_value.replace(".", "").replace("-", "").isdigit():
return int(float(clean_value))
return 0
return int(value) if value is not None else 0
elif field_type in ["FloatField", "DoubleField"]:
return float(value) if value is not None else 0.0
elif field_type == "BooleanField":
if isinstance(value, str):
return value.lower() in ("true", "1", "yes", "on")
return bool(value)
elif field_type == "DateTimeField":
if isinstance(value, (int, float)):
return datetime.fromtimestamp(value)
elif isinstance(value, str):
try:
# 尝试解析ISO格式日期
return datetime.fromisoformat(value.replace("Z", "+00:00"))
except ValueError:
try:
# 尝试解析时间戳字符串
return datetime.fromtimestamp(float(value))
except ValueError:
return datetime.now()
return datetime.now()
return value
except (ValueError, TypeError) as e:
logger.warning(f"字段值转换失败 ({field_type}): {value} -> {e}")
return self._get_default_value_for_field(target_field)
def _get_default_value_for_field(self, field: Field) -> Any:
"""获取字段的默认值"""
field_type = field.__class__.__name__
if hasattr(field, "default") and field.default is not None:
return field.default
if field.null:
return None
# 根据字段类型返回默认值
if field_type in ["CharField", "TextField"]:
return ""
elif field_type == "IntegerField":
return 0
elif field_type in ["FloatField", "DoubleField"]:
return 0.0
elif field_type == "BooleanField":
return False
elif field_type == "DateTimeField":
return datetime.now()
return None
def _validate_data(self, collection_name: str, data: Dict[str, Any], doc_id: Any, stats: MigrationStats) -> bool:
"""数据验证已禁用 - 始终返回True"""
return True
def _save_checkpoint(self, collection_name: str, processed_count: int, last_id: Any):
"""保存迁移断点"""
checkpoint = MigrationCheckpoint(
collection_name=collection_name,
processed_count=processed_count,
last_processed_id=last_id,
timestamp=datetime.now(),
)
checkpoint_file = self.checkpoint_dir / f"{collection_name}_checkpoint.pkl"
try:
with open(checkpoint_file, "wb") as f:
pickle.dump(checkpoint, f)
except Exception as e:
logger.warning(f"保存断点失败: {e}")
def _load_checkpoint(self, collection_name: str) -> Optional[MigrationCheckpoint]:
"""加载迁移断点"""
checkpoint_file = self.checkpoint_dir / f"{collection_name}_checkpoint.pkl"
if not checkpoint_file.exists():
return None
try:
with open(checkpoint_file, "rb") as f:
return pickle.load(f)
except Exception as e:
logger.warning(f"加载断点失败: {e}")
return None
def _batch_insert(self, model: Type[Model], data_list: List[Dict[str, Any]]) -> int:
"""批量插入数据"""
if not data_list:
return 0
success_count = 0
try:
with db.atomic():
# 分批插入避免SQL语句过长
batch_size = 100
for i in range(0, len(data_list), batch_size):
batch = data_list[i : i + batch_size]
model.insert_many(batch).execute()
success_count += len(batch)
except Exception as e:
logger.error(f"批量插入失败: {e}")
# 如果批量插入失败,尝试逐个插入
for data in data_list:
try:
model.create(**data)
success_count += 1
except Exception:
pass # 忽略单个插入失败
return success_count
def _check_duplicate_by_unique_fields(
self, model: Type[Model], data: Dict[str, Any], unique_fields: List[str]
) -> bool:
"""根据唯一字段检查重复"""
if not unique_fields:
return False
try:
query = model.select()
for field_name in unique_fields:
if field_name in data and data[field_name] is not None:
field_obj = getattr(model, field_name)
query = query.where(field_obj == data[field_name])
return query.exists()
except Exception as e:
logger.debug(f"重复检查失败: {e}")
return False
def _create_model_instance(self, model: Type[Model], data: Dict[str, Any]) -> Optional[Model]:
"""使用ORM创建模型实例"""
try:
# 过滤掉不存在的字段
valid_data = {}
for field_name, value in data.items():
if hasattr(model, field_name):
valid_data[field_name] = value
else:
logger.debug(f"跳过未知字段: {field_name}")
# 创建实例
instance = model.create(**valid_data)
return instance
except IntegrityError as e:
# 处理唯一约束冲突等完整性错误
logger.debug(f"完整性约束冲突: {e}")
return None
except Exception as e:
logger.error(f"创建模型实例失败: {e}")
return None
def migrate_collection(self, config: MigrationConfig) -> MigrationStats:
"""迁移单个集合 - 使用优化的批量插入和进度条"""
stats = MigrationStats()
stats.start_time = datetime.now()
# 检查是否有断点
checkpoint = self._load_checkpoint(config.mongo_collection)
start_from_id = checkpoint.last_processed_id if checkpoint else None
if checkpoint:
stats.processed_count = checkpoint.processed_count
logger.info(f"从断点恢复: 已处理 {checkpoint.processed_count} 条记录")
logger.info(f"开始迁移: {config.mongo_collection} -> {config.target_model._meta.table_name}")
try:
# 获取MongoDB集合
mongo_collection = self.mongo_db[config.mongo_collection]
# 构建查询条件(用于断点恢复)
query = {}
if start_from_id:
query = {"_id": {"$gt": start_from_id}}
stats.total_documents = mongo_collection.count_documents(query)
if stats.total_documents == 0:
logger.warning(f"集合 {config.mongo_collection} 为空,跳过迁移")
return stats
logger.info(f"待迁移文档数量: {stats.total_documents}")
# 创建Rich进度条
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeElapsedColumn(),
TimeRemainingColumn(),
console=self.console,
refresh_per_second=10,
) as progress:
task = progress.add_task(f"迁移 {config.mongo_collection}", total=stats.total_documents)
# 批量处理数据
batch_data = []
batch_count = 0
last_processed_id = None
for mongo_doc in mongo_collection.find(query).batch_size(config.batch_size):
try:
doc_id = mongo_doc.get("_id", "unknown")
last_processed_id = doc_id
# 构建目标数据
target_data = {}
for mongo_field, sqlite_field in config.field_mapping.items():
value = self._get_nested_value(mongo_doc, mongo_field)
# 获取目标字段对象并转换类型
if hasattr(config.target_model, sqlite_field):
field_obj = getattr(config.target_model, sqlite_field)
converted_value = self._convert_field_value(value, field_obj)
target_data[sqlite_field] = converted_value
# 数据验证已禁用
# if config.enable_validation:
# if not self._validate_data(config.mongo_collection, target_data, doc_id, stats):
# stats.skipped_count += 1
# continue
# 重复检查
if config.skip_duplicates and self._check_duplicate_by_unique_fields(
config.target_model, target_data, config.unique_fields
):
stats.duplicate_count += 1
stats.skipped_count += 1
logger.debug(f"跳过重复记录: {doc_id}")
continue
# 添加到批量数据
batch_data.append(target_data)
stats.processed_count += 1
# 执行批量插入
if len(batch_data) >= config.batch_size:
success_count = self._batch_insert(config.target_model, batch_data)
stats.success_count += success_count
stats.batch_insert_count += 1
# 保存断点
self._save_checkpoint(config.mongo_collection, stats.processed_count, last_processed_id)
batch_data.clear()
batch_count += 1
# 更新进度条
progress.update(task, advance=config.batch_size)
except Exception as e:
doc_id = mongo_doc.get("_id", "unknown")
stats.add_error(doc_id, f"处理文档异常: {e}", mongo_doc)
logger.error(f"处理文档失败 (ID: {doc_id}): {e}")
# 处理剩余的批量数据
if batch_data:
success_count = self._batch_insert(config.target_model, batch_data)
stats.success_count += success_count
stats.batch_insert_count += 1
progress.update(task, advance=len(batch_data))
# 完成进度条
progress.update(task, completed=stats.total_documents)
stats.end_time = datetime.now()
duration = stats.end_time - stats.start_time
logger.info(
f"迁移完成: {config.mongo_collection} -> {config.target_model._meta.table_name}\n"
f"总计: {stats.total_documents}, 成功: {stats.success_count}, "
f"错误: {stats.error_count}, 跳过: {stats.skipped_count}, 重复: {stats.duplicate_count}\n"
f"耗时: {duration.total_seconds():.2f}秒, 批量插入次数: {stats.batch_insert_count}"
)
# 清理断点文件
checkpoint_file = self.checkpoint_dir / f"{config.mongo_collection}_checkpoint.pkl"
if checkpoint_file.exists():
checkpoint_file.unlink()
except Exception as e:
logger.error(f"迁移集合 {config.mongo_collection} 时发生异常: {e}")
stats.add_error("collection_error", str(e))
return stats
def migrate_all(self) -> Dict[str, MigrationStats]:
"""执行所有迁移任务"""
logger.info("开始执行数据库迁移...")
if not self.connect_mongodb():
logger.error("无法连接到MongoDB迁移终止")
return {}
all_stats = {}
try:
# 创建总体进度表格
total_collections = len(self.migration_configs)
self.console.print(
Panel(
f"[bold blue]MongoDB 到 SQLite 数据迁移[/bold blue]\n"
f"[yellow]总集合数: {total_collections}[/yellow]",
title="迁移开始",
expand=False,
)
)
for idx, config in enumerate(self.migration_configs, 1):
self.console.print(
f"\n[bold green]正在处理集合 {idx}/{total_collections}: {config.mongo_collection}[/bold green]"
)
stats = self.migrate_collection(config)
all_stats[config.mongo_collection] = stats
# 显示单个集合的快速统计
if stats.processed_count > 0:
success_rate = stats.success_count / stats.processed_count * 100
if success_rate >= 95:
status_emoji = ""
status_color = "bright_green"
elif success_rate >= 80:
status_emoji = "⚠️"
status_color = "yellow"
else:
status_emoji = ""
status_color = "red"
self.console.print(
f" {status_emoji} [{status_color}]完成: {stats.success_count}/{stats.processed_count} "
f"({success_rate:.1f}%) 错误: {stats.error_count}[/{status_color}]"
)
# 错误率检查
if stats.processed_count > 0:
error_rate = stats.error_count / stats.processed_count
if error_rate > 0.1: # 错误率超过10%
self.console.print(
f" [red]⚠️ 警告: 错误率较高 {error_rate:.1%} "
f"({stats.error_count}/{stats.processed_count})[/red]"
)
finally:
self.disconnect_mongodb()
self._print_migration_summary(all_stats)
return all_stats
def _print_migration_summary(self, all_stats: Dict[str, MigrationStats]):
"""使用Rich打印美观的迁移汇总信息"""
# 计算总体统计
total_processed = sum(stats.processed_count for stats in all_stats.values())
total_success = sum(stats.success_count for stats in all_stats.values())
total_errors = sum(stats.error_count for stats in all_stats.values())
total_skipped = sum(stats.skipped_count for stats in all_stats.values())
total_duplicates = sum(stats.duplicate_count for stats in all_stats.values())
total_validation_errors = sum(stats.validation_errors for stats in all_stats.values())
total_batch_inserts = sum(stats.batch_insert_count for stats in all_stats.values())
# 计算总耗时
total_duration_seconds = 0
for stats in all_stats.values():
if stats.start_time and stats.end_time:
duration = stats.end_time - stats.start_time
total_duration_seconds += duration.total_seconds()
# 创建详细统计表格
table = Table(title="[bold blue]数据迁移汇总报告[/bold blue]", show_header=True, header_style="bold magenta")
table.add_column("集合名称", style="cyan", width=20)
table.add_column("文档总数", justify="right", style="blue")
table.add_column("处理数量", justify="right", style="green")
table.add_column("成功数量", justify="right", style="green")
table.add_column("错误数量", justify="right", style="red")
table.add_column("跳过数量", justify="right", style="yellow")
table.add_column("重复数量", justify="right", style="bright_yellow")
table.add_column("验证错误", justify="right", style="red")
table.add_column("批次数", justify="right", style="purple")
table.add_column("成功率", justify="right", style="bright_green")
table.add_column("耗时(秒)", justify="right", style="blue")
for collection_name, stats in all_stats.items():
success_rate = (stats.success_count / stats.processed_count * 100) if stats.processed_count > 0 else 0
duration = 0
if stats.start_time and stats.end_time:
duration = (stats.end_time - stats.start_time).total_seconds()
# 根据成功率设置颜色
if success_rate >= 95:
success_rate_style = "[bright_green]"
elif success_rate >= 80:
success_rate_style = "[yellow]"
else:
success_rate_style = "[red]"
table.add_row(
collection_name,
str(stats.total_documents),
str(stats.processed_count),
str(stats.success_count),
f"[red]{stats.error_count}[/red]" if stats.error_count > 0 else "0",
f"[yellow]{stats.skipped_count}[/yellow]" if stats.skipped_count > 0 else "0",
f"[bright_yellow]{stats.duplicate_count}[/bright_yellow]" if stats.duplicate_count > 0 else "0",
f"[red]{stats.validation_errors}[/red]" if stats.validation_errors > 0 else "0",
str(stats.batch_insert_count),
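                # success_rate_style 形如 "[bright_green]";用 [1:] 去掉开头的 "[" 即可拼出闭合标记 "[/bright_green]"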
f"{success_rate_style}{success_rate:.1f}%[/{success_rate_style[1:]}",
f"{duration:.2f}",
)
# 添加总计行
total_success_rate = (total_success / total_processed * 100) if total_processed > 0 else 0
if total_success_rate >= 95:
total_rate_style = "[bright_green]"
elif total_success_rate >= 80:
total_rate_style = "[yellow]"
else:
total_rate_style = "[red]"
table.add_section()
table.add_row(
"[bold]总计[/bold]",
f"[bold]{sum(stats.total_documents for stats in all_stats.values())}[/bold]",
f"[bold]{total_processed}[/bold]",
f"[bold]{total_success}[/bold]",
f"[bold red]{total_errors}[/bold red]" if total_errors > 0 else "[bold]0[/bold]",
f"[bold yellow]{total_skipped}[/bold yellow]" if total_skipped > 0 else "[bold]0[/bold]",
f"[bold bright_yellow]{total_duplicates}[/bold bright_yellow]"
if total_duplicates > 0
else "[bold]0[/bold]",
f"[bold red]{total_validation_errors}[/bold red]" if total_validation_errors > 0 else "[bold]0[/bold]",
f"[bold]{total_batch_inserts}[/bold]",
f"[bold]{total_rate_style}{total_success_rate:.1f}%[/{total_rate_style[1:]}[/bold]",
f"[bold]{total_duration_seconds:.2f}[/bold]",
)
self.console.print(table)
# 创建状态面板
status_items = []
if total_errors > 0:
status_items.append(f"[red]⚠️ 发现 {total_errors} 个错误,请检查日志详情[/red]")
if total_validation_errors > 0:
status_items.append(f"[red]🔍 数据验证失败: {total_validation_errors} 条记录[/red]")
if total_duplicates > 0:
status_items.append(f"[yellow]📋 跳过重复记录: {total_duplicates} 条[/yellow]")
if total_success_rate >= 95:
status_items.append(f"[bright_green]✅ 迁移成功率优秀: {total_success_rate:.1f}%[/bright_green]")
elif total_success_rate >= 80:
status_items.append(f"[yellow]⚡ 迁移成功率良好: {total_success_rate:.1f}%[/yellow]")
else:
status_items.append(f"[red]❌ 迁移成功率较低: {total_success_rate:.1f}%,需要检查[/red]")
if status_items:
status_panel = Panel(
"\n".join(status_items), title="[bold yellow]迁移状态总结[/bold yellow]", border_style="yellow"
)
self.console.print(status_panel)
# 性能统计面板
avg_speed = total_processed / total_duration_seconds if total_duration_seconds > 0 else 0
performance_info = (
f"[cyan]总处理时间:[/cyan] {total_duration_seconds:.2f}\n"
f"[cyan]平均处理速度:[/cyan] {avg_speed:.1f} 条记录/秒\n"
f"[cyan]批量插入优化:[/cyan] 执行了 {total_batch_inserts} 次批量操作"
)
performance_panel = Panel(performance_info, title="[bold green]性能统计[/bold green]", border_style="green")
self.console.print(performance_panel)
def add_migration_config(self, config: MigrationConfig):
"""添加新的迁移配置"""
self.migration_configs.append(config)
def migrate_single_collection(self, collection_name: str) -> Optional[MigrationStats]:
"""迁移单个指定的集合"""
config = next((c for c in self.migration_configs if c.mongo_collection == collection_name), None)
if not config:
logger.error(f"未找到集合 {collection_name} 的迁移配置")
return None
if not self.connect_mongodb():
logger.error("无法连接到MongoDB")
return None
try:
stats = self.migrate_collection(config)
self._print_migration_summary({collection_name: stats})
return stats
finally:
self.disconnect_mongodb()
def export_error_report(self, all_stats: Dict[str, MigrationStats], filepath: str):
"""导出错误报告"""
error_report = {
"timestamp": datetime.now().isoformat(),
"summary": {
collection: {
"total": stats.total_documents,
"processed": stats.processed_count,
"success": stats.success_count,
"errors": stats.error_count,
"skipped": stats.skipped_count,
"duplicates": stats.duplicate_count,
}
for collection, stats in all_stats.items()
},
"errors": {collection: stats.errors for collection, stats in all_stats.items() if stats.errors},
}
try:
with open(filepath, "w", encoding="utf-8") as f:
json.dump(error_report, f, ensure_ascii=False, indent=2)
logger.info(f"错误报告已导出到: {filepath}")
except Exception as e:
logger.error(f"导出错误报告失败: {e}")
def main():
"""主程序入口"""
migrator = MongoToSQLiteMigrator()
# 执行迁移
migration_results = migrator.migrate_all()
# 导出错误报告(如果有错误)
if any(stats.error_count > 0 for stats in migration_results.values()):
error_report_path = f"migration_errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
migrator.export_error_report(migration_results, error_report_path)
logger.info("数据迁移完成!")
if __name__ == "__main__":
main()
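
下面给出一个调用示意(独立草图;模块名 mongodb_to_sqlite 为假设,实际以本文件的文件名为准):只迁移单个集合,并在出错时导出报告。

from datetime import datetime

from mongodb_to_sqlite import MongoToSQLiteMigrator  # 假设的模块名

migrator = MongoToSQLiteMigrator()
stats = migrator.migrate_single_collection("expression")  # 集合名仅为示例
if stats is not None and stats.error_count > 0:
    report_path = f"migration_errors_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    migrator.export_error_report({"expression": stats}, report_path)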

scripts/raw_data_preprocessor.py Normal file

@@ -0,0 +1,75 @@
import os
import sys
from pathlib import Path

# 先把项目根目录加入 sys.path,再导入项目内模块,否则下面的 import 会失败
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.chat.knowledge.utils.hash import get_sha256
from src.common.logger import get_logger
logger = get_logger("lpmm")
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
RAW_DATA_PATH = os.path.join(ROOT_PATH, "data/lpmm_raw_data")
# IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data/imported_lpmm_data")
def _process_text_file(file_path):
"""处理单个文本文件,返回段落列表"""
with open(file_path, "r", encoding="utf-8") as f:
raw = f.read()
paragraphs = []
paragraph = ""
for line in raw.split("\n"):
if line.strip() == "":
if paragraph != "":
paragraphs.append(paragraph.strip())
paragraph = ""
else:
paragraph += line + "\n"
if paragraph != "":
paragraphs.append(paragraph.strip())
return paragraphs
def _process_multi_files() -> list:
raw_files = list(Path(RAW_DATA_PATH).glob("*.txt"))
if not raw_files:
logger.warning("警告: data/lpmm_raw_data 中没有找到任何 .txt 文件")
sys.exit(1)
# 处理所有文件
all_paragraphs = []
for file in raw_files:
logger.info(f"正在处理文件: {file.name}")
paragraphs = _process_text_file(file)
all_paragraphs.extend(paragraphs)
return all_paragraphs
def load_raw_data() -> tuple[list[str], list[str]]:
    """加载原始数据文件

    读取 data/lpmm_raw_data 下的原始文本,切分为段落并按SHA256去重后加载到内存中

    Returns:
        - sha256_list: 各段落的SHA256列表
        - raw_data: 去重后的段落列表(与sha256_list一一对应)
    """
    all_paragraphs = _process_multi_files()
    sha256_list = []
    sha256_set = set()
    raw_data = []
    for item in all_paragraphs:
        if not isinstance(item, str):
            logger.warning(f"数据类型错误:{item}")
            continue
        pg_hash = get_sha256(item)
        if pg_hash in sha256_set:
            logger.warning(f"重复数据:{item}")
            continue
        sha256_set.add(pg_hash)
        sha256_list.append(pg_hash)
        raw_data.append(item)
    logger.info(f"共读取到{len(raw_data)}条数据")
    return sha256_list, raw_data
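
段落切分规则是"以空行分段、段内保留换行"。下面是一个不依赖本仓库代码的最小示例,逻辑与 _process_text_file 相同:

raw = "第一段第一行\n第一段第二行\n\n第二段\n\n\n第三段\n"

paragraphs, paragraph = [], ""
for line in raw.split("\n"):
    if line.strip() == "":
        if paragraph != "":
            paragraphs.append(paragraph.strip())
            paragraph = ""
    else:
        paragraph += line + "\n"
if paragraph != "":
    paragraphs.append(paragraph.strip())

print(paragraphs)  # ['第一段第一行\n第一段第二行', '第二段', '第三段']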

scripts/run.sh Normal file

@@ -0,0 +1,556 @@
#!/bin/bash
# MaiCore & NapCat Adapter一键安装脚本 by Cookie_987
# 适用于Arch/Ubuntu 24.10/Debian 12/CentOS 9
# 请小心使用任何一键脚本!
INSTALLER_VERSION="0.0.5-refactor"
LANG=C.UTF-8
# 如无法访问GitHub请修改此处镜像地址
GITHUB_REPO="https://ghfast.top/https://github.com"
# 颜色输出
GREEN="\e[32m"
RED="\e[31m"
RESET="\e[0m"
# 需要的基本软件包
declare -A REQUIRED_PACKAGES=(
["common"]="git sudo python3 curl gnupg"
["debian"]="python3-venv python3-pip build-essential"
["ubuntu"]="python3-venv python3-pip build-essential"
["centos"]="epel-release python3-pip python3-devel gcc gcc-c++ make"
["arch"]="python-virtualenv python-pip base-devel"
)
# 默认项目目录
DEFAULT_INSTALL_DIR="/opt/maicore"
# 服务名称
SERVICE_NAME="maicore"
SERVICE_NAME_WEB="maicore-web"
SERVICE_NAME_NBADAPTER="maibot-napcat-adapter"
IS_INSTALL_NAPCAT=false
IS_INSTALL_DEPENDENCIES=false
# 检查是否已安装
check_installed() {
[[ -f /etc/systemd/system/${SERVICE_NAME}.service ]]
}
# 加载安装信息
load_install_info() {
if [[ -f /etc/maicore_install.conf ]]; then
source /etc/maicore_install.conf
else
INSTALL_DIR="$DEFAULT_INSTALL_DIR"
BRANCH="refactor"
fi
}
# 显示管理菜单
show_menu() {
while true; do
        choice=$(whiptail --title "MaiCore管理菜单" --menu "请选择要执行的操作:" 18 60 9 \
"1" "启动MaiCore" \
"2" "停止MaiCore" \
"3" "重启MaiCore" \
"4" "启动NapCat Adapter" \
"5" "停止NapCat Adapter" \
"6" "重启NapCat Adapter" \
"7" "拉取最新MaiCore仓库" \
"8" "切换分支" \
"9" "退出" 3>&1 1>&2 2>&3)
[[ $? -ne 0 ]] && exit 0
case "$choice" in
1)
systemctl start ${SERVICE_NAME}
whiptail --msgbox "✅MaiCore已启动" 10 60
;;
2)
systemctl stop ${SERVICE_NAME}
whiptail --msgbox "🛑MaiCore已停止" 10 60
;;
3)
systemctl restart ${SERVICE_NAME}
whiptail --msgbox "🔄MaiCore已重启" 10 60
;;
4)
systemctl start ${SERVICE_NAME_NBADAPTER}
whiptail --msgbox "✅NapCat Adapter已启动" 10 60
;;
5)
systemctl stop ${SERVICE_NAME_NBADAPTER}
whiptail --msgbox "🛑NapCat Adapter已停止" 10 60
;;
6)
systemctl restart ${SERVICE_NAME_NBADAPTER}
whiptail --msgbox "🔄NapCat Adapter已重启" 10 60
;;
7)
update_dependencies
;;
8)
switch_branch
;;
9)
exit 0
;;
*)
whiptail --msgbox "无效选项!" 10 60
;;
esac
done
}
# 更新依赖
update_dependencies() {
whiptail --title "⚠" --msgbox "更新后请阅读教程" 10 60
systemctl stop ${SERVICE_NAME}
cd "${INSTALL_DIR}/MaiBot" || {
whiptail --msgbox "🚫 无法进入安装目录!" 10 60
return 1
}
if ! git pull origin "${BRANCH}"; then
whiptail --msgbox "🚫 代码更新失败!" 10 60
return 1
fi
source "${INSTALL_DIR}/venv/bin/activate"
if ! pip install -r requirements.txt; then
whiptail --msgbox "🚫 依赖安装失败!" 10 60
deactivate
return 1
fi
deactivate
whiptail --msgbox "✅ 已停止服务并拉取最新仓库提交" 10 60
}
# 切换分支
switch_branch() {
new_branch=$(whiptail --inputbox "请输入要切换的分支名称:" 10 60 "${BRANCH}" 3>&1 1>&2 2>&3)
[[ -z "$new_branch" ]] && {
whiptail --msgbox "🚫 分支名称不能为空!" 10 60
return 1
}
cd "${INSTALL_DIR}/MaiBot" || {
whiptail --msgbox "🚫 无法进入安装目录!" 10 60
return 1
}
if ! git ls-remote --exit-code --heads origin "${new_branch}" >/dev/null 2>&1; then
whiptail --msgbox "🚫 分支 ${new_branch} 不存在!" 10 60
return 1
fi
if ! git checkout "${new_branch}"; then
whiptail --msgbox "🚫 分支切换失败!" 10 60
return 1
fi
if ! git pull origin "${new_branch}"; then
whiptail --msgbox "🚫 代码拉取失败!" 10 60
return 1
fi
systemctl stop ${SERVICE_NAME}
source "${INSTALL_DIR}/venv/bin/activate"
pip install -r requirements.txt
deactivate
sed -i "s/^BRANCH=.*/BRANCH=${new_branch}/" /etc/maicore_install.conf
BRANCH="${new_branch}"
check_eula
whiptail --msgbox "✅ 已停止服务并切换到分支 ${new_branch} " 10 60
}
check_eula() {
# 首先计算当前EULA的MD5值
current_md5=$(md5sum "${INSTALL_DIR}/MaiBot/EULA.md" | awk '{print $1}')
# 首先计算当前隐私条款文件的哈希值
current_md5_privacy=$(md5sum "${INSTALL_DIR}/MaiBot/PRIVACY.md" | awk '{print $1}')
    # 如果当前的md5值为空则直接返回
    if [[ -z $current_md5 || -z $current_md5_privacy ]]; then
        whiptail --msgbox "🚫 未找到使用协议\n请检查PRIVACY.md和EULA.md是否存在" 10 60
        return 1
    fi
# 检查eula.confirmed文件是否存在
if [[ -f ${INSTALL_DIR}/MaiBot/eula.confirmed ]]; then
# 如果存在则检查其中包含的md5与current_md5是否一致
confirmed_md5=$(cat ${INSTALL_DIR}/MaiBot/eula.confirmed)
else
confirmed_md5=""
fi
# 检查privacy.confirmed文件是否存在
if [[ -f ${INSTALL_DIR}/MaiBot/privacy.confirmed ]]; then
# 如果存在则检查其中包含的md5与current_md5是否一致
confirmed_md5_privacy=$(cat ${INSTALL_DIR}/MaiBot/privacy.confirmed)
else
confirmed_md5_privacy=""
fi
# 如果EULA或隐私条款有更新提示用户重新确认
if [[ $current_md5 != $confirmed_md5 || $current_md5_privacy != $confirmed_md5_privacy ]]; then
whiptail --title "📜 使用协议更新" --yesno "检测到MaiCore EULA或隐私条款已更新。\nhttps://github.com/MaiM-with-u/MaiBot/blob/refactor/EULA.md\nhttps://github.com/MaiM-with-u/MaiBot/blob/refactor/PRIVACY.md\n\n您是否同意上述协议 \n\n " 12 70
if [[ $? -eq 0 ]]; then
echo -n $current_md5 > ${INSTALL_DIR}/MaiBot/eula.confirmed
echo -n $current_md5_privacy > ${INSTALL_DIR}/MaiBot/privacy.confirmed
else
exit 1
fi
fi
}
# ----------- 主安装流程 -----------
run_installation() {
# 1/6: 检测是否安装 whiptail
if ! command -v whiptail &>/dev/null; then
echo -e "${RED}[1/6] whiptail 未安装,正在安装...${RESET}"
if command -v apt-get &>/dev/null; then
apt-get update && apt-get install -y whiptail
elif command -v pacman &>/dev/null; then
pacman -Syu --noconfirm whiptail
elif command -v yum &>/dev/null; then
yum install -y whiptail
else
echo -e "${RED}[Error] 无受支持的包管理器,无法安装 whiptail!${RESET}"
exit 1
fi
fi
whiptail --title " 提示" --msgbox "如果您没有特殊需求请优先使用docker方式部署。" 10 60
# 协议确认
if ! (whiptail --title " [1/6] 使用协议" --yes-button "我同意" --no-button "我拒绝" --yesno "使用MaiCore及此脚本前请先阅读EULA协议及隐私协议\nhttps://github.com/MaiM-with-u/MaiBot/blob/refactor/EULA.md\nhttps://github.com/MaiM-with-u/MaiBot/blob/refactor/PRIVACY.md\n\n您是否同意上述协议" 12 70); then
exit 1
fi
# 欢迎信息
whiptail --title "[2/6] 欢迎使用MaiCore一键安装脚本 by Cookie987" --msgbox "检测到您未安装MaiCore将自动进入安装流程安装完成后再次运行此脚本即可进入管理菜单。\n\n项目处于活跃开发阶段代码可能随时更改\n文档未完善有问题可以提交 Issue 或者 Discussion\nQQ机器人存在被限制风险请自行了解谨慎使用\n由于持续迭代可能存在一些已知或未知的bug\n由于开发中可能消耗较多token\n\n本脚本可能更新不及时如遇到bug请优先尝试手动部署以确定是否为脚本问题" 17 60
# 系统检查
check_system() {
if [[ "$(id -u)" -ne 0 ]]; then
whiptail --title "🚫 权限不足" --msgbox "请使用 root 用户运行此脚本!\n执行方式: sudo bash $0" 10 60
exit 1
fi
if [[ -f /etc/os-release ]]; then
source /etc/os-release
if [[ "$ID" == "debian" && "$VERSION_ID" == "12" ]]; then
return
elif [[ "$ID" == "ubuntu" && "$VERSION_ID" == "24.10" ]]; then
return
elif [[ "$ID" == "centos" && "$VERSION_ID" == "9" ]]; then
return
elif [[ "$ID" == "arch" ]]; then
whiptail --title "⚠️ 兼容性警告" --msgbox "NapCat无可用的 Arch Linux 官方安装方法将无法自动安装NapCat。\n\n您可尝试在AUR中搜索相关包。" 10 60
return
else
whiptail --title "🚫 不支持的系统" --msgbox "此脚本仅支持 Arch/Debian 12 (Bookworm)/Ubuntu 24.10 (Oracular Oriole)/CentOS9\n当前系统: $PRETTY_NAME\n安装已终止。" 10 60
exit 1
fi
else
whiptail --title "⚠️ 无法检测系统" --msgbox "无法识别系统版本,安装已终止。" 10 60
exit 1
fi
}
check_system
# 设置包管理器
case "$ID" in
debian|ubuntu)
PKG_MANAGER="apt"
;;
centos)
PKG_MANAGER="yum"
;;
arch)
# 添加arch包管理器
PKG_MANAGER="pacman"
;;
esac
# 检查NapCat
check_napcat() {
if command -v napcat &>/dev/null; then
NAPCAT_INSTALLED=true
else
NAPCAT_INSTALLED=false
fi
}
check_napcat
# 安装必要软件包
install_packages() {
missing_packages=()
# 检查 common 及当前系统专属依赖
for package in ${REQUIRED_PACKAGES["common"]} ${REQUIRED_PACKAGES["$ID"]}; do
case "$PKG_MANAGER" in
apt)
dpkg -s "$package" &>/dev/null || missing_packages+=("$package")
;;
yum)
rpm -q "$package" &>/dev/null || missing_packages+=("$package")
;;
pacman)
pacman -Qi "$package" &>/dev/null || missing_packages+=("$package")
;;
esac
done
if [[ ${#missing_packages[@]} -gt 0 ]]; then
whiptail --title "📦 [3/6] 依赖检查" --yesno "以下软件包缺失:\n${missing_packages[*]}\n\n是否自动安装" 10 60
if [[ $? -eq 0 ]]; then
IS_INSTALL_DEPENDENCIES=true
else
whiptail --title "⚠️ 注意" --yesno "未安装某些依赖,可能影响运行!\n是否继续" 10 60 || exit 1
fi
fi
}
install_packages
# 安装NapCat
install_napcat() {
[[ $NAPCAT_INSTALLED == true ]] && return
whiptail --title "📦 [3/6] 软件包检查" --yesno "检测到未安装NapCat是否安装\n如果您想使用远程NapCat请跳过此步。" 10 60 && {
IS_INSTALL_NAPCAT=true
}
}
# 仅在非Arch系统上安装NapCat
[[ "$ID" != "arch" ]] && install_napcat
# Python版本检查
check_python() {
PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
if ! python3 -c "import sys; exit(0) if sys.version_info >= (3,10) else exit(1)"; then
whiptail --title "⚠️ [4/6] Python 版本过低" --msgbox "检测到 Python 版本为 $PYTHON_VERSION,需要 3.10 或以上!\n请升级 Python 后重新运行本脚本。" 10 60
exit 1
fi
}
# 如果没安装python则不检查python版本
if command -v python3 &>/dev/null; then
check_python
fi
# 选择分支
choose_branch() {
BRANCH=$(whiptail --title "🔀 选择分支" --radiolist "请选择要安装的分支:" 15 60 4 \
"main" "稳定版本(推荐)" ON \
"dev" "开发版(不知道什么意思就别选)" OFF \
"classical" "经典版0.6.0以前的版本)" OFF \
"custom" "自定义分支" OFF 3>&1 1>&2 2>&3)
RETVAL=$?
if [ $RETVAL -ne 0 ]; then
whiptail --msgbox "🚫 操作取消!" 10 60
exit 1
fi
if [[ "$BRANCH" == "custom" ]]; then
BRANCH=$(whiptail --title "🔀 自定义分支" --inputbox "请输入自定义分支名称:" 10 60 "refactor" 3>&1 1>&2 2>&3)
RETVAL=$?
if [ $RETVAL -ne 0 ]; then
whiptail --msgbox "🚫 输入取消!" 10 60
exit 1
fi
if [[ -z "$BRANCH" ]]; then
whiptail --msgbox "🚫 分支名称不能为空!" 10 60
exit 1
fi
fi
}
choose_branch
# 选择安装路径
choose_install_dir() {
INSTALL_DIR=$(whiptail --title "📂 [6/6] 选择安装路径" --inputbox "请输入MaiCore的安装目录" 10 60 "$DEFAULT_INSTALL_DIR" 3>&1 1>&2 2>&3)
[[ -z "$INSTALL_DIR" ]] && {
whiptail --title "⚠️ 取消输入" --yesno "未输入安装路径,是否退出安装?" 10 60 && exit 1
INSTALL_DIR="$DEFAULT_INSTALL_DIR"
}
}
choose_install_dir
# 确认安装
confirm_install() {
local confirm_msg="请确认以下更改:\n\n"
confirm_msg+="📂 安装MaiCore、NapCat Adapter到: $INSTALL_DIR\n"
confirm_msg+="🔀 分支: $BRANCH\n"
    [[ $IS_INSTALL_DEPENDENCIES == true ]] && confirm_msg+="📦 安装依赖:${missing_packages[*]}\n"
[[ $IS_INSTALL_NAPCAT == true ]] && confirm_msg+="📦 安装额外组件:\n"
[[ $IS_INSTALL_NAPCAT == true ]] && confirm_msg+=" - NapCat\n"
confirm_msg+="\n注意本脚本默认使用ghfast.top为GitHub进行加速如不想使用请手动修改脚本开头的GITHUB_REPO变量。"
whiptail --title "🔧 安装确认" --yesno "$confirm_msg" 20 60 || exit 1
}
confirm_install
# 开始安装
echo -e "${GREEN}安装${missing_packages[@]}...${RESET}"
if [[ $IS_INSTALL_DEPENDENCIES == true ]]; then
case "$PKG_MANAGER" in
apt)
apt update && apt install -y "${missing_packages[@]}"
;;
yum)
yum install -y "${missing_packages[@]}" --nobest
;;
pacman)
pacman -S --noconfirm "${missing_packages[@]}"
;;
esac
fi
if [[ $IS_INSTALL_NAPCAT == true ]]; then
echo -e "${GREEN}安装 NapCat...${RESET}"
curl -o napcat.sh https://nclatest.znin.net/NapNeko/NapCat-Installer/main/script/install.sh && bash napcat.sh --cli y --docker n
fi
echo -e "${GREEN}创建安装目录...${RESET}"
mkdir -p "$INSTALL_DIR"
cd "$INSTALL_DIR" || exit 1
echo -e "${GREEN}设置Python虚拟环境...${RESET}"
python3 -m venv venv
source venv/bin/activate
echo -e "${GREEN}克隆MaiCore仓库...${RESET}"
git clone -b "$BRANCH" "$GITHUB_REPO/MaiM-with-u/MaiBot" MaiBot || {
echo -e "${RED}克隆MaiCore仓库失败${RESET}"
exit 1
}
echo -e "${GREEN}克隆 maim_message 包仓库...${RESET}"
git clone $GITHUB_REPO/MaiM-with-u/maim_message.git || {
echo -e "${RED}克隆 maim_message 包仓库失败!${RESET}"
exit 1
}
echo -e "${GREEN}克隆 nonebot-plugin-maibot-adapters 仓库...${RESET}"
git clone $GITHUB_REPO/MaiM-with-u/MaiBot-Napcat-Adapter.git || {
echo -e "${RED}克隆 MaiBot-Napcat-Adapter.git 仓库失败!${RESET}"
exit 1
}
echo -e "${GREEN}安装Python依赖...${RESET}"
pip install -r MaiBot/requirements.txt
cd MaiBot
pip install uv
uv pip install -i https://mirrors.aliyun.com/pypi/simple -r requirements.txt
cd ..
echo -e "${GREEN}安装maim_message依赖...${RESET}"
cd maim_message
uv pip install -i https://mirrors.aliyun.com/pypi/simple -e .
cd ..
echo -e "${GREEN}部署MaiBot Napcat Adapter...${RESET}"
cd MaiBot-Napcat-Adapter
uv pip install -i https://mirrors.aliyun.com/pypi/simple -r requirements.txt
cd ..
echo -e "${GREEN}同意协议...${RESET}"
# 首先计算当前EULA的MD5值
current_md5=$(md5sum "MaiBot/EULA.md" | awk '{print $1}')
# 首先计算当前隐私条款文件的哈希值
current_md5_privacy=$(md5sum "MaiBot/PRIVACY.md" | awk '{print $1}')
echo -n $current_md5 > MaiBot/eula.confirmed
echo -n $current_md5_privacy > MaiBot/privacy.confirmed
echo -e "${GREEN}创建系统服务...${RESET}"
cat > /etc/systemd/system/${SERVICE_NAME}.service <<EOF
[Unit]
Description=MaiCore
After=network.target ${SERVICE_NAME_NBADAPTER}.service
[Service]
Type=simple
WorkingDirectory=${INSTALL_DIR}/MaiBot
ExecStart=$INSTALL_DIR/venv/bin/python3 bot.py
Restart=always
RestartSec=10s
[Install]
WantedBy=multi-user.target
EOF
# cat > /etc/systemd/system/${SERVICE_NAME_WEB}.service <<EOF
# [Unit]
# Description=MaiCore WebUI
# After=network.target ${SERVICE_NAME}.service
# [Service]
# Type=simple
# WorkingDirectory=${INSTALL_DIR}/MaiBot
# ExecStart=$INSTALL_DIR/venv/bin/python3 webui.py
# Restart=always
# RestartSec=10s
# [Install]
# WantedBy=multi-user.target
# EOF
cat > /etc/systemd/system/${SERVICE_NAME_NBADAPTER}.service <<EOF
[Unit]
Description=MaiBot Napcat Adapter
After=network.target mongod.service ${SERVICE_NAME}.service
[Service]
Type=simple
WorkingDirectory=${INSTALL_DIR}/MaiBot-Napcat-Adapter
ExecStart=$INSTALL_DIR/venv/bin/python3 main.py
Restart=always
RestartSec=10s
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
# 保存安装信息
echo "INSTALLER_VERSION=${INSTALLER_VERSION}" > /etc/maicore_install.conf
echo "INSTALL_DIR=${INSTALL_DIR}" >> /etc/maicore_install.conf
echo "BRANCH=${BRANCH}" >> /etc/maicore_install.conf
whiptail --title "🎉 安装完成" --msgbox "MaiCore安装完成\n已创建系统服务${SERVICE_NAME}${SERVICE_NAME_WEB}${SERVICE_NAME_NBADAPTER}\n\n使用以下命令管理服务\n启动服务systemctl start ${SERVICE_NAME}\n查看状态systemctl status ${SERVICE_NAME}" 14 60
}
# ----------- 主执行流程 -----------
# 检查root权限
[[ $(id -u) -ne 0 ]] && {
echo -e "${RED}请使用root用户运行此脚本${RESET}"
exit 1
}
# 如果已安装显示菜单,并检查协议是否更新
if check_installed; then
load_install_info
check_eula
show_menu
else
run_installation
# 安装完成后询问是否启动
if whiptail --title "安装完成" --yesno "是否立即启动MaiCore服务" 10 60; then
systemctl start ${SERVICE_NAME}
whiptail --msgbox "✅ 服务已启动!\n使用 systemctl status ${SERVICE_NAME} 查看状态" 10 60
fi
fi
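
check_eula 的协议确认机制是:对 EULA.md / PRIVACY.md 取 MD5,与 eula.confirmed / privacy.confirmed 中记录的值比对,不一致即认为协议有更新、需要重新确认。下面用 Python 写一个等价逻辑的草图(路径仅为示例):

import hashlib
from pathlib import Path

def file_md5(path: Path) -> str:
    return hashlib.md5(path.read_bytes()).hexdigest()

def eula_confirmed(doc: Path, marker: Path) -> bool:
    """marker 中保存上次确认时 doc 的 MD5,不一致说明协议已更新"""
    return marker.exists() and marker.read_text().strip() == file_md5(doc)

def confirm(doc: Path, marker: Path) -> None:
    marker.write_text(file_md5(doc))

# 用法示意:
# doc, marker = Path("MaiBot/EULA.md"), Path("MaiBot/eula.confirmed")
# if not eula_confirmed(doc, marker):
#     ...  # 弹窗征得同意后再 confirm(doc, marker)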

scripts/run_lpmm.sh Normal file

@@ -0,0 +1,51 @@
#!/bin/bash
# ==============================================
# Environment Initialization
# ==============================================
# Step 1: Locate project root directory
SCRIPTS_DIR="scripts"
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
PROJECT_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
# Step 2: Verify scripts directory exists
if [ ! -d "$PROJECT_ROOT/$SCRIPTS_DIR" ]; then
echo "❌ Error: scripts directory not found in project root" >&2
echo "Current path: $PROJECT_ROOT" >&2
exit 1
fi
# Step 3: Set up Python environment
export PYTHONPATH="$PROJECT_ROOT:$PYTHONPATH"
cd "$PROJECT_ROOT" || {
echo "❌ Failed to cd to project root: $PROJECT_ROOT" >&2
exit 1
}
# Debug info
echo "============================"
echo "Project Root: $PROJECT_ROOT"
echo "Python Path: $PYTHONPATH"
echo "Working Dir: $(pwd)"
echo "============================"
# ==============================================
# Python Script Execution
# ==============================================
run_python_script() {
local script_name=$1
echo "🔄 Running $script_name"
if ! python3 "$SCRIPTS_DIR/$script_name"; then
echo "$script_name failed" >&2
exit 1
fi
}
# Execute scripts in order
run_python_script "raw_data_preprocessor.py"
run_python_script "info_extraction.py"
run_python_script "import_openie.py"
echo "✅ All scripts completed successfully"


@@ -0,0 +1,394 @@
import time
import sys
import os
import re
from typing import Dict, List, Tuple, Optional
from datetime import datetime
# Add project root to Python path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, project_root)
from src.common.database.database_model import Messages, ChatStreams #noqa
def contains_emoji_or_image_tags(text: str) -> bool:
"""Check if text contains [表情包xxxxx] or [图片xxxxx] tags"""
if not text:
return False
# 检查是否包含 [表情包] 或 [图片] 标记
emoji_pattern = r'\[表情包[^\]]*\]'
image_pattern = r'\[图片[^\]]*\]'
return bool(re.search(emoji_pattern, text) or re.search(image_pattern, text))
def clean_reply_text(text: str) -> str:
"""Remove reply references like [回复 xxxx...] from text"""
if not text:
return text
# 匹配 [回复 xxxx...] 格式的内容
# 使用非贪婪匹配,匹配到第一个 ] 就停止
cleaned_text = re.sub(r'\[回复[^\]]*\]', '', text)
# 去除多余的空白字符
cleaned_text = cleaned_text.strip()
return cleaned_text
def get_chat_name(chat_id: str) -> str:
"""Get chat name from chat_id by querying ChatStreams table directly"""
try:
chat_stream = ChatStreams.get_or_none(ChatStreams.stream_id == chat_id)
if chat_stream is None:
return f"未知聊天 ({chat_id})"
if chat_stream.group_name:
return f"{chat_stream.group_name} ({chat_id})"
elif chat_stream.user_nickname:
return f"{chat_stream.user_nickname}的私聊 ({chat_id})"
else:
return f"未知聊天 ({chat_id})"
except Exception:
return f"查询失败 ({chat_id})"
def format_timestamp(timestamp: float) -> str:
"""Format timestamp to readable date string"""
try:
return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")
except (ValueError, OSError):
return "未知时间"
def calculate_text_length_distribution(messages) -> Dict[str, int]:
"""Calculate distribution of processed_plain_text length"""
distribution = {
'0': 0, # 空文本
'1-5': 0, # 极短文本
'6-10': 0, # 很短文本
'11-20': 0, # 短文本
'21-30': 0, # 较短文本
'31-50': 0, # 中短文本
'51-70': 0, # 中等文本
'71-100': 0, # 较长文本
'101-150': 0, # 长文本
'151-200': 0, # 很长文本
'201-300': 0, # 超长文本
'301-500': 0, # 极长文本
'501-1000': 0, # 巨长文本
'1000+': 0 # 超巨长文本
}
for msg in messages:
if msg.processed_plain_text is None:
continue
# 排除包含表情包或图片标记的消息
if contains_emoji_or_image_tags(msg.processed_plain_text):
continue
# 清理文本中的回复引用
cleaned_text = clean_reply_text(msg.processed_plain_text)
length = len(cleaned_text)
if length == 0:
distribution['0'] += 1
elif length <= 5:
distribution['1-5'] += 1
elif length <= 10:
distribution['6-10'] += 1
elif length <= 20:
distribution['11-20'] += 1
elif length <= 30:
distribution['21-30'] += 1
elif length <= 50:
distribution['31-50'] += 1
elif length <= 70:
distribution['51-70'] += 1
elif length <= 100:
distribution['71-100'] += 1
elif length <= 150:
distribution['101-150'] += 1
elif length <= 200:
distribution['151-200'] += 1
elif length <= 300:
distribution['201-300'] += 1
elif length <= 500:
distribution['301-500'] += 1
elif length <= 1000:
distribution['501-1000'] += 1
else:
distribution['1000+'] += 1
return distribution
def get_text_length_stats(messages) -> Dict[str, float]:
"""Calculate basic statistics for processed_plain_text length"""
lengths = []
null_count = 0
excluded_count = 0 # 被排除的消息数量
for msg in messages:
if msg.processed_plain_text is None:
null_count += 1
elif contains_emoji_or_image_tags(msg.processed_plain_text):
# 排除包含表情包或图片标记的消息
excluded_count += 1
else:
# 清理文本中的回复引用
cleaned_text = clean_reply_text(msg.processed_plain_text)
lengths.append(len(cleaned_text))
if not lengths:
return {
'count': 0,
'null_count': null_count,
'excluded_count': excluded_count,
'min': 0,
'max': 0,
'avg': 0,
'median': 0
}
lengths.sort()
count = len(lengths)
return {
'count': count,
'null_count': null_count,
'excluded_count': excluded_count,
'min': min(lengths),
'max': max(lengths),
'avg': sum(lengths) / count,
'median': lengths[count // 2] if count % 2 == 1 else (lengths[count // 2 - 1] + lengths[count // 2]) / 2
}
def get_available_chats() -> List[Tuple[str, str, int]]:
"""Get all available chats with message counts"""
try:
# 获取所有有消息的chat_id排除特殊类型消息
chat_counts = {}
for msg in Messages.select(Messages.chat_id).distinct():
chat_id = msg.chat_id
count = Messages.select().where(
(Messages.chat_id == chat_id) &
(Messages.is_emoji != 1) &
(Messages.is_picid != 1) &
(Messages.is_command != 1)
).count()
if count > 0:
chat_counts[chat_id] = count
# 获取聊天名称
result = []
for chat_id, count in chat_counts.items():
chat_name = get_chat_name(chat_id)
result.append((chat_id, chat_name, count))
# 按消息数量排序
result.sort(key=lambda x: x[2], reverse=True)
return result
except Exception as e:
print(f"获取聊天列表失败: {e}")
return []
def get_time_range_input() -> Tuple[Optional[float], Optional[float]]:
"""Get time range input from user"""
print("\n时间范围选择:")
print("1. 最近1天")
print("2. 最近3天")
print("3. 最近7天")
print("4. 最近30天")
print("5. 自定义时间范围")
print("6. 不限制时间")
choice = input("请选择时间范围 (1-6): ").strip()
now = time.time()
if choice == "1":
return now - 24*3600, now
elif choice == "2":
return now - 3*24*3600, now
elif choice == "3":
return now - 7*24*3600, now
elif choice == "4":
return now - 30*24*3600, now
elif choice == "5":
print("请输入开始时间 (格式: YYYY-MM-DD HH:MM:SS):")
start_str = input().strip()
print("请输入结束时间 (格式: YYYY-MM-DD HH:MM:SS):")
end_str = input().strip()
try:
start_time = datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S").timestamp()
end_time = datetime.strptime(end_str, "%Y-%m-%d %H:%M:%S").timestamp()
return start_time, end_time
except ValueError:
print("时间格式错误,将不限制时间范围")
return None, None
else:
return None, None
def get_top_longest_messages(messages, top_n: int = 10) -> List[Tuple[str, int, str, str]]:
"""Get top N longest messages"""
message_lengths = []
for msg in messages:
if msg.processed_plain_text is not None:
# 排除包含表情包或图片标记的消息
if contains_emoji_or_image_tags(msg.processed_plain_text):
continue
# 清理文本中的回复引用
cleaned_text = clean_reply_text(msg.processed_plain_text)
length = len(cleaned_text)
chat_name = get_chat_name(msg.chat_id)
time_str = format_timestamp(msg.time)
# 截取前100个字符作为预览
preview = cleaned_text[:100] + "..." if len(cleaned_text) > 100 else cleaned_text
message_lengths.append((chat_name, length, time_str, preview))
# 按长度排序取前N个
message_lengths.sort(key=lambda x: x[1], reverse=True)
return message_lengths[:top_n]
def analyze_text_lengths(chat_id: Optional[str] = None, start_time: Optional[float] = None, end_time: Optional[float] = None) -> None:
"""Analyze processed_plain_text lengths with optional filters"""
# 构建查询条件,排除特殊类型的消息
query = Messages.select().where(
(Messages.is_emoji != 1) &
(Messages.is_picid != 1) &
(Messages.is_command != 1)
)
if chat_id:
query = query.where(Messages.chat_id == chat_id)
if start_time:
query = query.where(Messages.time >= start_time)
if end_time:
query = query.where(Messages.time <= end_time)
messages = list(query)
if not messages:
print("没有找到符合条件的消息")
return
# 计算统计信息
distribution = calculate_text_length_distribution(messages)
stats = get_text_length_stats(messages)
top_longest = get_top_longest_messages(messages, 10)
# 显示结果
print("\n=== Processed Plain Text 长度分析结果 ===")
print("(已排除表情、图片ID、命令类型消息已排除[表情包]和[图片]标记消息,已清理回复引用)")
if chat_id:
print(f"聊天: {get_chat_name(chat_id)}")
else:
print("聊天: 全部聊天")
if start_time and end_time:
print(f"时间范围: {format_timestamp(start_time)}{format_timestamp(end_time)}")
elif start_time:
print(f"时间范围: {format_timestamp(start_time)} 之后")
elif end_time:
print(f"时间范围: {format_timestamp(end_time)} 之前")
else:
print("时间范围: 不限制")
print("\n基本统计:")
print(f"总消息数量: {len(messages)}")
print(f"有文本消息数量: {stats['count']}")
print(f"空文本消息数量: {stats['null_count']}")
print(f"被排除的消息数量: {stats['excluded_count']}")
if stats['count'] > 0:
print(f"最短长度: {stats['min']} 字符")
print(f"最长长度: {stats['max']} 字符")
print(f"平均长度: {stats['avg']:.2f} 字符")
print(f"中位数长度: {stats['median']:.2f} 字符")
print("\n文本长度分布:")
total = stats['count']
if total > 0:
for range_name, count in distribution.items():
if count > 0:
percentage = count / total * 100
print(f"{range_name} 字符: {count} ({percentage:.2f}%)")
# 显示最长的消息
if top_longest:
print(f"\n最长的 {len(top_longest)} 条消息:")
for i, (chat_name, length, time_str, preview) in enumerate(top_longest, 1):
print(f"{i}. [{chat_name}] {time_str}")
print(f" 长度: {length} 字符")
print(f" 预览: {preview}")
print()
def interactive_menu() -> None:
"""Interactive menu for text length analysis"""
while True:
print("\n" + "="*50)
print("Processed Plain Text 长度分析工具")
print("="*50)
print("1. 分析全部聊天")
print("2. 选择特定聊天分析")
print("q. 退出")
choice = input("\n请选择分析模式 (1-2, q): ").strip()
if choice.lower() == 'q':
print("再见!")
break
chat_id = None
if choice == "2":
# 显示可用的聊天列表
chats = get_available_chats()
if not chats:
print("没有找到聊天数据")
continue
print(f"\n可用的聊天 (共{len(chats)}个):")
for i, (_cid, name, count) in enumerate(chats, 1):
print(f"{i}. {name} ({count}条消息)")
try:
chat_choice = int(input(f"\n请选择聊天 (1-{len(chats)}): ").strip())
if 1 <= chat_choice <= len(chats):
chat_id = chats[chat_choice - 1][0]
else:
print("无效选择")
continue
except ValueError:
print("请输入有效数字")
continue
elif choice != "1":
print("无效选择")
continue
# 获取时间范围
start_time, end_time = get_time_range_input()
# 执行分析
analyze_text_lengths(chat_id, start_time, end_time)
input("\n按回车键继续...")
if __name__ == "__main__":
interactive_menu()
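
其中两个文本过滤函数的行为可以用下面的独立小示例验证(正则与上文一致):

import re

def contains_emoji_or_image_tags(text: str) -> bool:
    return bool(re.search(r'\[表情包[^\]]*\]', text) or re.search(r'\[图片[^\]]*\]', text))

def clean_reply_text(text: str) -> str:
    return re.sub(r'\[回复[^\]]*\]', '', text).strip()

assert contains_emoji_or_image_tags("看这个[表情包:哈哈]")
assert not contains_emoji_or_image_tags("普通消息")
assert clean_reply_text("[回复 张三:在吗] 在的") == "在的"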


@@ -117,30 +117,36 @@ class EmbeddingStore:
         self.idx2hash = None
     def _get_embedding(self, s: str) -> List[float]:
-        """获取字符串的嵌入向量,处理异步调用"""
+        """获取字符串的嵌入向量,使用完全同步的方式避免事件循环问题"""
+        # 创建新的事件循环并在完成后立即关闭
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
         try:
-            # 尝试获取当前事件循环
-            asyncio.get_running_loop()
-            # 如果在事件循环中,使用线程池执行
-            import concurrent.futures
+            # 创建新的LLMRequest实例
+            from src.llm_models.utils_model import LLMRequest
+            from src.config.config import model_config
-            def run_in_thread():
-                return asyncio.run(get_embedding(s))
+            llm = LLMRequest(model_set=model_config.model_task_config.embedding, request_type="embedding")
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(run_in_thread)
-                result = future.result()
-                if result is None:
-                    logger.error(f"获取嵌入失败: {s}")
-                    return []
-                return result
-        except RuntimeError:
-            # 没有运行的事件循环,直接运行
-            result = asyncio.run(get_embedding(s))
-            if result is None:
+            # 使用新的事件循环运行异步方法
+            embedding, _ = loop.run_until_complete(llm.get_embedding(s))
+            if embedding and len(embedding) > 0:
+                return embedding
+            else:
                 logger.error(f"获取嵌入失败: {s}")
                 return []
-            return result
+        except Exception as e:
+            logger.error(f"获取嵌入时发生异常: {s}, 错误: {e}")
+            return []
+        finally:
+            # 确保事件循环被正确关闭
+            try:
+                loop.close()
+            except Exception:
+                pass
     def _get_embeddings_batch_threaded(self, strs: List[str], chunk_size: int = 10, max_workers: int = 10, progress_callback=None) -> List[Tuple[str, List[float]]]:
         """使用多线程批量获取嵌入向量
@@ -181,8 +187,14 @@ class EmbeddingStore:
             for i, s in enumerate(chunk_strs):
                 try:
-                    # 直接使用异步函数
-                    embedding = asyncio.run(llm.get_embedding(s))
+                    # 在线程中创建独立的事件循环
+                    loop = asyncio.new_event_loop()
+                    asyncio.set_event_loop(loop)
+                    try:
+                        embedding = loop.run_until_complete(llm.get_embedding(s))
+                    finally:
+                        loop.close()
                     if embedding and len(embedding) > 0:
                         chunk_results.append((start_idx + i, s, embedding[0]))  # embedding[0] 是实际的向量
                     else:
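
上面两处改动用的是同一个模式:为每次调用新建私有事件循环、用完即关,避免协程和客户端对象跨循环复用。这个模式可以抽成一个通用的小工具函数,下面是一个独立示意草图(不依赖本仓库代码;注意只能在没有运行中事件循环的线程里调用):

import asyncio
from typing import Awaitable, TypeVar

T = TypeVar("T")

def run_coro_in_fresh_loop(coro: Awaitable[T]) -> T:
    """在当前线程新建事件循环同步执行协程,结束后关闭循环"""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
        asyncio.set_event_loop(None)

async def demo() -> str:
    await asyncio.sleep(0)
    return "ok"

print(run_coro_in_fresh_loop(demo()))  # ok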


@@ -111,6 +111,7 @@ def is_mentioned_bot_in_message(message: MessageRecv) -> tuple[bool, float]:
 async def get_embedding(text, request_type="embedding") -> Optional[List[float]]:
     """获取文本的embedding向量"""
+    # 每次都创建新的LLMRequest实例以避免事件循环冲突
+    llm = LLMRequest(model_set=model_config.model_task_config.embedding, request_type=request_type)
     try:
         embedding, _ = await llm.get_embedding(text)


@@ -159,14 +159,23 @@ class ClientRegistry:
         return decorator
-    def get_client_class_instance(self, api_provider: APIProvider) -> BaseClient:
+    def get_client_class_instance(self, api_provider: APIProvider, force_new=False) -> BaseClient:
         """
         获取注册的API客户端实例
         Args:
             api_provider: APIProvider实例
+            force_new: 是否强制创建新实例(用于解决事件循环问题)
         Returns:
             BaseClient: 注册的API客户端实例
         """
+        # 如果强制创建新实例,直接创建不使用缓存
+        if force_new:
+            if client_class := self.client_registry.get(api_provider.client_type):
+                return client_class(api_provider)
+            else:
+                raise KeyError(f"'{api_provider.client_type}' 类型的 Client 未注册")
+        # 正常的缓存逻辑
         if api_provider.name not in self.client_instance_cache:
             if client_class := self.client_registry.get(api_provider.client_type):
                 self.client_instance_cache[api_provider.name] = client_class(api_provider)
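
force_new 的取舍在于:缓存实例可以复用连接池,但缓存会把客户端绑在最初创建它的事件循环上。下面用一个最小的独立示例演示这种"可绕过的实例缓存"(与上文 ClientRegistry 的逻辑同构,类名均为虚构):

class DummyClient:
    def __init__(self, name: str):
        self.name = name

class MiniRegistry:
    def __init__(self):
        self._cache: dict[str, DummyClient] = {}

    def get(self, name: str, force_new: bool = False) -> DummyClient:
        if force_new:
            return DummyClient(name)  # 不读缓存,也不写缓存
        if name not in self._cache:
            self._cache[name] = DummyClient(name)
        return self._cache[name]

reg = MiniRegistry()
a = reg.get("openai")
b = reg.get("openai")
c = reg.get("openai", force_new=True)
assert a is b and c is not a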


@@ -380,6 +380,7 @@ class OpenaiClient(BaseClient):
             base_url=api_provider.base_url,
             api_key=api_provider.api_key,
             max_retries=0,
+            timeout=api_provider.timeout,
         )
     async def get_response(
@@ -512,6 +513,11 @@ class OpenaiClient(BaseClient):
                 extra_body=extra_params,
             )
         except APIConnectionError as e:
+            # 添加详细的错误信息以便调试
+            logger.error(f"OpenAI API连接错误(嵌入模型): {str(e)}")
+            logger.error(f"错误类型: {type(e)}")
+            if hasattr(e, '__cause__') and e.__cause__:
+                logger.error(f"底层错误: {str(e.__cause__)}")
             raise NetworkConnectionError() from e
         except APIStatusError as e:
             # 重封装APIError为RespNotOkException
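
这里补充的日志专门打印了异常链(__cause__),因为 APIConnectionError 往往只是对底层 socket/SSL 错误的包装。下面是一个独立小示例,演示同样的"沿异常链逐层打印"的写法:

import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("demo")

def log_with_cause(e: BaseException) -> None:
    logger.error(f"错误: {e}(类型: {type(e).__name__})")
    cause = e.__cause__
    while cause is not None:
        logger.error(f"底层错误: {cause}(类型: {type(cause).__name__})")
        cause = cause.__cause__

try:
    try:
        raise ConnectionError("connection reset")
    except ConnectionError as inner:
        raise RuntimeError("API连接错误") from inner
except RuntimeError as e:
    log_with_cause(e)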


@@ -416,7 +416,10 @@ class LLMRequest:
             )
             model_info = model_config.get_model_info(least_used_model_name)
             api_provider = model_config.get_provider(model_info.api_provider)
-            client = client_registry.get_client_class_instance(api_provider)
+            # 对于嵌入任务,强制创建新的客户端实例以避免事件循环问题
+            force_new_client = (self.request_type == "embedding")
+            client = client_registry.get_client_class_instance(api_provider, force_new=force_new_client)
             logger.debug(f"选择请求模型: {model_info.name}")
             total_tokens, penalty, usage_penalty = self.model_usage[model_info.name]
             self.model_usage[model_info.name] = (total_tokens, penalty, usage_penalty + 1)  # 增加使用惩罚值防止连续使用