feat(memory): 实现三阶段记忆检索系统并简化提取策略

- 移除规则和混合提取策略，统一使用LLM提取
- 实现三阶段检索：元数据粗筛→向量精筛→综合重排
- 新增JSON元数据索引支持，提升检索效率
- 优化Vector DB配置管理和批处理机制
- 统一记忆作用域为全局，实现完全共享
- 增强查询规划和综合评分算法
2025-10-02 10:13:38 +08:00
parent 6f750e2bac
commit 59bda71f29
9 changed files with 814 additions and 297 deletions
--- a/scripts/rebuild_metadata_index.py
+++ b/scripts/rebuild_metadata_index.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Rebuild the JSON metadata index from existing ChromaDB data
+"""
+import asyncio
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # put the parent of this script's directory first on sys.path so the `src.*` imports below resolve (presumably the repo root)
+
+from src.chat.memory_system.memory_system import MemorySystem
+from src.chat.memory_system.memory_metadata_index import MemoryMetadataIndexEntry
+from src.common.logger import get_logger
+
+logger = get_logger(__name__)
+
+async def rebuild_metadata_index():
+    """Rebuild the metadata index from the memories stored in ChromaDB."""
+    print("="*80)
+    print("重建JSON元数据索引")
+    print("="*80)
+    
+    # Initialize the memory system
+    print("\n🔧 初始化记忆系统...")
+    ms = MemorySystem()
+    await ms.initialize()
+    print("✅ 记忆系统已初始化")
+    
+    if not hasattr(ms.unified_storage, 'metadata_index'):  # bail out early when the storage exposes no metadata index
+        print("❌ 元数据索引管理器未初始化")
+        return
+    
+    # Fetch all memories
+    print("\n📥 从ChromaDB获取所有记忆...")
+    from src.common.vector_db import vector_db_service
+    
+    try:
+        # Fetch the whole memory collection (no id or filter argument is passed)
+        collection_name = ms.unified_storage.config.memory_collection
+        result = vector_db_service.get(
+            collection_name=collection_name,
+            include=["documents", "metadatas", "embeddings"]  # NOTE(review): only "ids" and "metadatas" are read below; documents/embeddings look unused
+        )
+        
+        if not result or not result.get("ids"):
+            print("❌ ChromaDB中没有找到记忆数据")
+            return
+        
+        ids = result["ids"]
+        metadatas = result.get("metadatas", [])  # NOTE(review): the default only applies when the key is missing; a None value would make zip() below raise
+        
+        print(f"✅ 找到 {len(ids)} 条记忆")
+        
+        # Rebuild the metadata index
+        print("\n🔨 开始重建元数据索引...")
+        entries = []
+        success_count = 0
+        
+        for i, (memory_id, metadata) in enumerate(zip(ids, metadatas), 1):  # i is 1-based and only used for progress output
+            try:
+                # Rebuild the index entry from the ChromaDB metadata
+                import orjson  # NOTE(review): executed on every iteration; could be hoisted out of the loop
+                
+                entry = MemoryMetadataIndexEntry(
+                    memory_id=memory_id,
+                    user_id=metadata.get("user_id", "unknown"),
+                    memory_type=metadata.get("memory_type", "general"),
+                    subjects=orjson.loads(metadata.get("subjects", "[]")),  # parsed as JSON; a missing key yields an empty list
+                    objects=[metadata.get("object")] if metadata.get("object") else [],  # single "object" value wrapped in a list; empty list when missing or falsy
+                    keywords=orjson.loads(metadata.get("keywords", "[]")),
+                    tags=orjson.loads(metadata.get("tags", "[]")),
+                    importance=2,  # default: NORMAL
+                    confidence=2,  # default: MEDIUM
+                    created_at=metadata.get("created_at", 0.0),
+                    access_count=metadata.get("access_count", 0),
+                    chat_id=metadata.get("chat_id"),
+                    content_preview=None
+                )
+                
+                # Try to parse the enum names of importance and confidence; unrecognized values keep the defaults set above
+                if "importance" in metadata:
+                    imp_str = metadata["importance"]
+                    if imp_str == "LOW":
+                        entry.importance = 1
+                    elif imp_str == "NORMAL":
+                        entry.importance = 2
+                    elif imp_str == "HIGH":
+                        entry.importance = 3
+                    elif imp_str == "CRITICAL":
+                        entry.importance = 4
+                
+                if "confidence" in metadata:
+                    conf_str = metadata["confidence"]
+                    if conf_str == "LOW":
+                        entry.confidence = 1
+                    elif conf_str == "MEDIUM":
+                        entry.confidence = 2
+                    elif conf_str == "HIGH":
+                        entry.confidence = 3
+                    elif conf_str == "VERIFIED":
+                        entry.confidence = 4
+                
+                entries.append(entry)
+                success_count += 1
+                
+                if i % 100 == 0:
+                    print(f"  处理进度: {i}/{len(ids)} ({success_count} 成功)")
+                
+            except Exception as e:  # best-effort: a record that fails to parse is logged and skipped
+                logger.warning(f"处理记忆 {memory_id} 失败: {e}")
+                continue
+        
+        print(f"\n✅ 成功解析 {success_count}/{len(ids)} 条记忆元数据")
+        
+        # Batch-update the index
+        print("\n💾 保存元数据索引...")
+        ms.unified_storage.metadata_index.batch_add_or_update(entries)
+        ms.unified_storage.metadata_index.save()
+        
+        # Show statistics
+        stats = ms.unified_storage.metadata_index.get_stats()
+        print(f"\n📊 重建后的索引统计:")  # NOTE(review): f-prefix has no placeholders on this line
+        print(f"  - 总记忆数: {stats['total_memories']}")
+        print(f"  - 主语数量: {stats['subjects_count']}")
+        print(f"  - 关键词数量: {stats['keywords_count']}")
+        print(f"  - 标签数量: {stats['tags_count']}")
+        print(f"  - 类型分布:")
+        for mtype, count in stats['types'].items():
+            print(f"    - {mtype}: {count}")
+        
+        print("\n✅ 元数据索引重建完成！")
+        
+    except Exception as e:  # top-level boundary: log with traceback and report to the console; nothing is re-raised
+        logger.error(f"重建索引失败: {e}", exc_info=True)
+        print(f"❌ 重建索引失败: {e}")
+
+if __name__ == '__main__':  # only when executed as a script, not on import
+    asyncio.run(rebuild_metadata_index())  # drive the coroutine to completion
--- a/scripts/run_multi_stage_smoke.py
+++ b/scripts/run_multi_stage_smoke.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Lightweight smoke test: initialize MemorySystem and run a single retrieval to verify that accessing MemoryMetadata.source no longer raises an error
+"""
+import asyncio
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # put the parent of this script's directory first on sys.path so the `src.*` import below resolves (presumably the repo root)
+
+from src.chat.memory_system.memory_system import MemorySystem
+
+async def main():  # smoke test: initialize MemorySystem, run one retrieval and print what came back
+    ms = MemorySystem()
+    await ms.initialize()
+    results = await ms.retrieve_relevant_memories(query_text="测试查询：杰瑞喵喜欢什么？", limit=3)
+    print(f"检索到 {len(results)} 条记忆（如果 >0 则表明运行成功）")
+    for i, m in enumerate(results, 1):  # 1-based numbering for the printed list
+        print(f"{i}. id={m.metadata.memory_id} source={getattr(m.metadata, 'source', None)}")  # getattr with a default: a missing "source" attribute prints None instead of raising
+
+if __name__ == '__main__':  # only when executed as a script, not on import
+    asyncio.run(main())  # drive the coroutine to completion