From 1396e94a867ec86afacdfc05f4b540c47d8b173c Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 6 Nov 2025 20:56:41 +0800 Subject: [PATCH 1/4] =?UTF-8?q?fix(manager):=20=E4=BC=98=E5=8C=96=E8=AE=B0?= =?UTF-8?q?=E5=BF=86=E6=95=B4=E5=90=88=E9=80=BB=E8=BE=91=EF=BC=8C=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E6=89=B9=E9=87=8F=E5=A4=84=E7=90=86=E9=99=90=E5=88=B6?= =?UTF-8?q?=EF=BC=8C=E6=8F=90=E5=8D=87=E6=80=A7=E8=83=BD=E5=92=8C=E7=A8=B3?= =?UTF-8?q?=E5=AE=9A=E6=80=A7=20fix(config):=20=E6=9B=B4=E6=96=B0=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E6=96=87=E4=BB=B6=E7=89=88=E6=9C=AC=EF=BC=8C=E8=B0=83?= =?UTF-8?q?=E6=95=B4=E8=AE=B0=E5=BF=86=E6=95=B4=E5=90=88=E9=98=88=E5=80=BC?= =?UTF-8?q?=E5=92=8C=E6=97=B6=E9=97=B4=E7=AA=97=E5=8F=A3=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/memory_graph/manager.py | 59 ++++++++++++++++++++++++------- template/bot_config_template.toml | 13 ++++--- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/memory_graph/manager.py b/src/memory_graph/manager.py index d80d08616..9c6ab4e94 100644 --- a/src/memory_graph/manager.py +++ b/src/memory_graph/manager.py @@ -967,14 +967,21 @@ class MemoryManager: async def consolidate_memories( self, similarity_threshold: float = 0.85, - time_window_hours: int = 24, + time_window_hours: float = 24.0, + max_batch_size: int = 50, ) -> Dict[str, Any]: """ - 整理记忆:合并相似记忆 + 整理记忆:直接合并去重相似记忆(不创建新边) + + 优化点: + 1. 添加批量限制,避免长时间阻塞 + 2. 相似记忆直接覆盖合并,不创建关联边 + 3. 使用 asyncio.sleep 让出控制权,避免阻塞事件循环 Args: - similarity_threshold: 相似度阈值 + similarity_threshold: 相似度阈值(默认0.85,建议提高到0.9减少误判) time_window_hours: 时间窗口(小时) + max_batch_size: 单次最多处理的记忆数量 Returns: 整理结果 @@ -983,11 +990,12 @@ class MemoryManager: await self.initialize() try: - logger.info(f"开始记忆整理 (similarity_threshold={similarity_threshold}, time_window={time_window_hours}h)...") + logger.info(f"开始记忆整理 (similarity_threshold={similarity_threshold}, time_window={time_window_hours}h, max_batch={max_batch_size})...") result = { "merged_count": 0, "checked_count": 0, + "skipped_count": 0, } # 获取最近创建的记忆 @@ -1003,6 +1011,12 @@ class MemoryManager: logger.info("没有需要整理的记忆") return result + # 限制批量处理数量 + if len(recent_memories) > max_batch_size: + logger.info(f"记忆数量 {len(recent_memories)} 超过批量限制 {max_batch_size},仅处理最新的 {max_batch_size} 条") + recent_memories = sorted(recent_memories, key=lambda m: m.created_at, reverse=True)[:max_batch_size] + result["skipped_count"] = len(all_memories) - max_batch_size + logger.info(f"找到 {len(recent_memories)} 条待整理记忆") result["checked_count"] = len(recent_memories) @@ -1014,6 +1028,9 @@ class MemoryManager: memories_by_type[mem_type] = [] memories_by_type[mem_type].append(mem) + # 记录已删除的记忆ID,避免重复处理 + deleted_ids = set() + # 对每个类型的记忆进行相似度检测 for mem_type, memories in memories_by_type.items(): if len(memories) < 2: @@ -1023,7 +1040,17 @@ class MemoryManager: # 使用向量相似度检测 for i in range(len(memories)): + # 让出控制权,避免长时间阻塞 + if i % 10 == 0: + await asyncio.sleep(0) + + if memories[i].id in deleted_ids: + continue + for j in range(i + 1, len(memories)): + if memories[j].id in deleted_ids: + continue + mem_i = memories[i] mem_j = memories[j] @@ -1044,23 +1071,28 @@ class MemoryManager: ) if similarity >= similarity_threshold: - # 合并记忆:保留重要性高的,删除另一个 + # 直接去重:保留重要性高的,删除另一个(不创建关联边) if mem_i.importance >= mem_j.importance: keep_mem, remove_mem = mem_i, mem_j else: keep_mem, remove_mem = mem_j, mem_i logger.info( - f"合并相似记忆 (similarity={similarity:.3f}): " + f"去重相似记忆 (similarity={similarity:.3f}): " 
f"保留 {keep_mem.id}, 删除 {remove_mem.id}" ) - # 增加保留记忆的重要性 - keep_mem.importance = min(1.0, keep_mem.importance + 0.1) - keep_mem.activation = min(1.0, keep_mem.activation + 0.1) + # 增强保留记忆的重要性(合并信息价值) + keep_mem.importance = min(1.0, keep_mem.importance + 0.05) + keep_mem.activation = min(1.0, keep_mem.activation + 0.05) - # 删除相似记忆 + # 将被删除记忆的访问次数累加到保留记忆 + if hasattr(keep_mem, 'access_count') and hasattr(remove_mem, 'access_count'): + keep_mem.access_count += remove_mem.access_count + + # 直接删除相似记忆(不创建边,简化图结构) await self.delete_memory(remove_mem.id) + deleted_ids.add(remove_mem.id) result["merged_count"] += 1 logger.info(f"记忆整理完成: {result}") @@ -1466,10 +1498,13 @@ class MemoryManager: } # 1. 记忆整理(合并相似记忆) + # 默认禁用自动整理,因为可能阻塞主流程 + # 建议:提高阈值到0.92以上,减少误判;限制批量大小避免阻塞 if getattr(self.config, 'consolidation_enabled', False): consolidate_result = await self.consolidate_memories( - similarity_threshold=getattr(self.config, 'consolidation_similarity_threshold', 0.9), - time_window_hours=getattr(self.config, 'consolidation_time_window_hours', 24.0) + similarity_threshold=getattr(self.config, 'consolidation_similarity_threshold', 0.92), + time_window_hours=getattr(self.config, 'consolidation_time_window_hours', 24.0), + max_batch_size=getattr(self.config, 'consolidation_max_batch_size', 50) ) result["consolidated"] = consolidate_result.get("merged_count", 0) diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml index 7385fc3c0..81f34c347 100644 --- a/template/bot_config_template.toml +++ b/template/bot_config_template.toml @@ -1,5 +1,5 @@ [inner] -version = "7.6.1" +version = "7.6.2" #----以下是给开发人员阅读的,如果你只是部署了MoFox-Bot,不需要阅读---- #如果你想要修改配置文件,请递增version的值 @@ -257,10 +257,13 @@ search_similarity_threshold = 0.5 # 向量相似度阈值 enable_query_optimization = true # 启用查询优化(使用小模型分析对话历史,生成综合性搜索查询) # === 记忆整合配置 === +# 注意:整合任务会遍历所有记忆进行相似度计算,可能占用较多资源 +# 建议:1) 降低执行频率;2) 提高相似度阈值减少误判;3) 限制批量大小 consolidation_enabled = true # 是否启用记忆整合 -consolidation_interval_hours = 1.0 # 整合任务执行间隔(小时) -consolidation_similarity_threshold = 0.85 # 相似记忆合并阈值 -consolidation_time_window_hours = 24 # 整合时间窗口(小时) +consolidation_interval_hours = 1.0 # 整合任务执行间隔 +consolidation_similarity_threshold = 0.92 # 相似记忆去重阈值(建议>=0.92减少误判,0.85太低) +consolidation_time_window_hours = 6.0 # 整合时间窗口(小时) +consolidation_max_batch_size = 50 # 单次最多处理的记忆数量(限制批量避免阻塞) # === 记忆遗忘配置 === forgetting_enabled = true # 是否启用自动遗忘 @@ -270,7 +273,7 @@ forgetting_min_importance = 0.8 # 最小保护重要性(高于此值的记忆 # === 记忆激活配置 === activation_decay_rate = 0.9 # 激活度衰减率(每天衰减10%) activation_propagation_strength = 0.5 # 激活传播强度(传播到相关记忆的激活度比例) -activation_propagation_depth = 1 # 激活传播深度(最多传播几层) +activation_propagation_depth = 2 # 激活传播深度(最多传播几层,建议1-2) # === 记忆检索配置 === search_max_expand_depth = 2 # 检索时图扩展深度(0=仅直接匹配,1=扩展1跳,2=扩展2跳,推荐1-2) From 28d41acc51961252dc92da80faa7c68699824f90 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 6 Nov 2025 21:09:31 +0800 Subject: [PATCH 2/4] =?UTF-8?q?feat(deduplicate=5Fmemories):=20=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E8=AE=B0=E5=BF=86=E5=8E=BB=E9=87=8D=E5=B7=A5=E5=85=B7?= =?UTF-8?q?=EF=BC=8C=E6=94=AF=E6=8C=81=E9=A2=84=E8=A7=88=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E5=92=8C=E7=9B=B8=E4=BC=BC=E5=BA=A6=E9=98=88=E5=80=BC=E8=AE=BE?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/guides/memory_deduplication_guide.md | 391 +++++++++++++++++++++ scripts/deduplicate_memories.py | 404 ++++++++++++++++++++++ 2 files changed, 795 insertions(+) create mode 100644 
docs/guides/memory_deduplication_guide.md create mode 100644 scripts/deduplicate_memories.py diff --git a/docs/guides/memory_deduplication_guide.md b/docs/guides/memory_deduplication_guide.md new file mode 100644 index 000000000..77d346a0c --- /dev/null +++ b/docs/guides/memory_deduplication_guide.md @@ -0,0 +1,391 @@ +# 记忆去重工具使用指南 + +## 📋 功能说明 + +`deduplicate_memories.py` 是一个用于清理重复记忆的工具。它会: + +1. 扫描所有标记为"相似"关系的记忆对 +2. 根据重要性、激活度和创建时间决定保留哪个 +3. 删除重复的记忆,保留最有价值的那个 +4. 提供详细的去重报告 + +## 🚀 快速开始 + +### 步骤1: 预览模式(推荐) + +**首次使用前,建议先运行预览模式,查看会删除哪些记忆:** + +```bash +python scripts/deduplicate_memories.py --dry-run +``` + +输出示例: +``` +============================================================ +记忆去重工具 +============================================================ +数据目录: data/memory_graph +相似度阈值: 0.85 +模式: 预览模式(不实际删除) +============================================================ + +✅ 记忆管理器初始化成功,共 156 条记忆 +找到 23 对相似记忆(阈值>=0.85) + +[预览] 去重相似记忆对 (相似度=0.904): + 保留: mem_20251106_202832_887727 + - 主题: 今天天气很好 + - 重要性: 0.60 + - 激活度: 0.55 + - 创建时间: 2024-11-06 20:28:32 + 删除: mem_20251106_202828_883440 + - 主题: 今天天气晴朗 + - 重要性: 0.50 + - 激活度: 0.50 + - 创建时间: 2024-11-06 20:28:28 + [预览模式] 不执行实际删除 + +============================================================ +去重报告 +============================================================ +总记忆数: 156 +相似记忆对: 23 +发现重复: 23 +预览通过: 23 +错误数: 0 +耗时: 2.35秒 + +⚠️ 这是预览模式,未实际删除任何记忆 +💡 要执行实际删除,请运行: python scripts/deduplicate_memories.py +============================================================ +``` + +### 步骤2: 执行去重 + +**确认预览结果无误后,执行实际去重:** + +```bash +python scripts/deduplicate_memories.py +``` + +输出示例: +``` +============================================================ +记忆去重工具 +============================================================ +数据目录: data/memory_graph +相似度阈值: 0.85 +模式: 执行模式(会实际删除) +============================================================ + +✅ 记忆管理器初始化成功,共 156 条记忆 +找到 23 对相似记忆(阈值>=0.85) + +[执行] 去重相似记忆对 (相似度=0.904): + 保留: mem_20251106_202832_887727 + ... + 删除: mem_20251106_202828_883440 + ... + ✅ 删除成功 + +正在保存数据... +✅ 数据已保存 + +============================================================ +去重报告 +============================================================ +总记忆数: 156 +相似记忆对: 23 +成功删除: 23 +错误数: 0 +耗时: 5.67秒 + +✅ 去重完成! +📊 最终记忆数: 133 (减少 23 条) +============================================================ +``` + +## 🎛️ 命令行参数 + +### `--dry-run`(推荐先使用) + +预览模式,不实际删除任何记忆。 + +```bash +python scripts/deduplicate_memories.py --dry-run +``` + +### `--threshold <相似度>` + +指定相似度阈值,只处理相似度大于等于此值的记忆对。 + +```bash +# 只处理高度相似(>=0.95)的记忆 +python scripts/deduplicate_memories.py --threshold 0.95 + +# 处理中等相似(>=0.8)的记忆 +python scripts/deduplicate_memories.py --threshold 0.8 +``` + +**阈值建议**: +- `0.95-1.0`: 极高相似度,几乎完全相同(最安全) +- `0.9-0.95`: 高度相似,内容基本一致(推荐) +- `0.85-0.9`: 中等相似,可能有细微差别(谨慎使用) +- `<0.85`: 低相似度,可能误删(不推荐) + +### `--data-dir <目录>` + +指定记忆数据目录。 + +```bash +# 对测试数据去重 +python scripts/deduplicate_memories.py --data-dir data/test_memory + +# 对备份数据去重 +python scripts/deduplicate_memories.py --data-dir data/memory_backup +``` + +## 📖 使用场景 + +### 场景1: 定期维护 + +**建议频率**: 每周或每月运行一次 + +```bash +# 1. 先预览 +python scripts/deduplicate_memories.py --dry-run --threshold 0.92 + +# 2. 
确认后执行 +python scripts/deduplicate_memories.py --threshold 0.92 +``` + +### 场景2: 清理大量重复 + +**适用于**: 导入外部数据后,或发现大量重复记忆 + +```bash +# 使用较低阈值,清理更多重复 +python scripts/deduplicate_memories.py --threshold 0.85 +``` + +### 场景3: 保守清理 + +**适用于**: 担心误删,只想删除极度相似的记忆 + +```bash +# 使用高阈值,只删除几乎完全相同的记忆 +python scripts/deduplicate_memories.py --threshold 0.98 +``` + +### 场景4: 测试环境 + +**适用于**: 在测试数据上验证效果 + +```bash +# 对测试数据执行去重 +python scripts/deduplicate_memories.py --data-dir data/test_memory --dry-run +``` + +## 🔍 去重策略 + +### 保留原则(按优先级) + +脚本会按以下优先级决定保留哪个记忆: + +1. **重要性更高** (`importance` 值更大) +2. **激活度更高** (`activation` 值更大) +3. **创建时间更早** (更早创建的记忆) + +### 增强保留记忆 + +保留的记忆会获得以下增强: + +- **重要性** +0.05(最高1.0) +- **激活度** +0.05(最高1.0) +- **访问次数** 累加被删除记忆的访问次数 + +### 示例 + +``` +记忆A: 重要性0.8, 激活度0.6, 创建于 2024-11-01 +记忆B: 重要性0.7, 激活度0.9, 创建于 2024-11-05 + +结果: 保留记忆A(重要性更高) +增强: 重要性 0.8 → 0.85, 激活度 0.6 → 0.65 +``` + +## ⚠️ 注意事项 + +### 1. 备份数据 + +**在执行实际去重前,建议备份数据:** + +```bash +# Windows +xcopy data\memory_graph data\memory_graph_backup /E /I /Y + +# Linux/Mac +cp -r data/memory_graph data/memory_graph_backup +``` + +### 2. 先预览再执行 + +**务必先运行 `--dry-run` 预览:** + +```bash +# 错误示范 ❌ +python scripts/deduplicate_memories.py # 直接执行 + +# 正确示范 ✅ +python scripts/deduplicate_memories.py --dry-run # 先预览 +python scripts/deduplicate_memories.py # 再执行 +``` + +### 3. 阈值选择 + +**过低的阈值可能导致误删:** + +```bash +# 风险较高 ⚠️ +python scripts/deduplicate_memories.py --threshold 0.7 + +# 推荐范围 ✅ +python scripts/deduplicate_memories.py --threshold 0.92 +``` + +### 4. 不可恢复 + +**删除的记忆无法恢复!** 如果不确定,请: + +1. 先备份数据 +2. 使用 `--dry-run` 预览 +3. 使用较高的阈值(如 0.95) + +### 5. 中断恢复 + +如果执行过程中中断(Ctrl+C),已删除的记忆无法恢复。建议: + +- 在低负载时段运行 +- 确保足够的执行时间 +- 使用 `--threshold` 限制处理数量 + +## 🐛 故障排查 + +### 问题1: 找不到相似记忆对 + +``` +找到 0 对相似记忆(阈值>=0.85) +``` + +**原因**: +- 没有标记为"相似"的边 +- 阈值设置过高 + +**解决**: +1. 降低阈值:`--threshold 0.7` +2. 检查记忆系统是否正确创建了相似关系 +3. 先运行自动关联任务 + +### 问题2: 初始化失败 + +``` +❌ 记忆管理器初始化失败 +``` + +**原因**: +- 数据目录不存在 +- 配置文件错误 +- 数据文件损坏 + +**解决**: +1. 检查数据目录是否存在 +2. 验证配置文件:`config/bot_config.toml` +3. 查看详细日志定位问题 + +### 问题3: 删除失败 + +``` +❌ 删除失败: ... +``` + +**原因**: +- 权限不足 +- 数据库锁定 +- 文件损坏 + +**解决**: +1. 检查文件权限 +2. 确保没有其他进程占用数据 +3. 恢复备份后重试 + +## 📊 性能参考 + +| 记忆数量 | 相似对数 | 执行时间(预览) | 执行时间(实际) | +|---------|---------|----------------|----------------| +| 100 | 10 | ~1秒 | ~2秒 | +| 500 | 50 | ~3秒 | ~6秒 | +| 1000 | 100 | ~5秒 | ~12秒 | +| 5000 | 500 | ~15秒 | ~45秒 | + +**注**: 实际时间取决于服务器性能和数据复杂度 + +## 🔗 相关工具 + +- **记忆整理**: `src/memory_graph/manager.py::consolidate_memories()` +- **自动关联**: `src/memory_graph/manager.py::auto_link_memories()` +- **配置验证**: `scripts/verify_config_update.py` + +## 💡 最佳实践 + +### 1. 定期维护流程 + +```bash +# 每周执行 +cd /path/to/bot + +# 1. 备份 +cp -r data/memory_graph data/memory_graph_backup_$(date +%Y%m%d) + +# 2. 预览 +python scripts/deduplicate_memories.py --dry-run --threshold 0.92 + +# 3. 执行 +python scripts/deduplicate_memories.py --threshold 0.92 + +# 4. 验证 +python scripts/verify_config_update.py +``` + +### 2. 保守去重策略 + +```bash +# 只删除极度相似的记忆 +python scripts/deduplicate_memories.py --dry-run --threshold 0.98 +python scripts/deduplicate_memories.py --threshold 0.98 +``` + +### 3. 
批量清理策略 + +```bash +# 先清理高相似度的 +python scripts/deduplicate_memories.py --threshold 0.95 + +# 再清理中相似度的(可选) +python scripts/deduplicate_memories.py --dry-run --threshold 0.9 +python scripts/deduplicate_memories.py --threshold 0.9 +``` + +## 📝 总结 + +- ✅ **务必先备份数据** +- ✅ **务必先运行 `--dry-run`** +- ✅ **建议使用阈值 >= 0.92** +- ✅ **定期运行,保持记忆库清洁** +- ❌ **避免过低阈值(< 0.85)** +- ❌ **避免跳过预览直接执行** + +--- + +**创建日期**: 2024-11-06 +**版本**: v1.0 +**维护者**: MoFox-Bot Team diff --git a/scripts/deduplicate_memories.py b/scripts/deduplicate_memories.py new file mode 100644 index 000000000..936fd9014 --- /dev/null +++ b/scripts/deduplicate_memories.py @@ -0,0 +1,404 @@ +""" +记忆去重工具 + +功能: +1. 扫描所有标记为"相似"关系的记忆边 +2. 对相似记忆进行去重(保留重要性高的,删除另一个) +3. 支持干运行模式(预览不执行) +4. 提供详细的去重报告 + +使用方法: + # 预览模式(不实际删除) + python scripts/deduplicate_memories.py --dry-run + + # 执行去重 + python scripts/deduplicate_memories.py + + # 指定相似度阈值 + python scripts/deduplicate_memories.py --threshold 0.9 + + # 指定数据目录 + python scripts/deduplicate_memories.py --data-dir data/memory_graph +""" +import argparse +import asyncio +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple + +import numpy as np + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from src.common.logger import get_logger +from src.memory_graph.manager_singleton import get_memory_manager, initialize_memory_manager, shutdown_memory_manager + +logger = get_logger(__name__) + + +class MemoryDeduplicator: + """记忆去重器""" + + def __init__(self, data_dir: str = "data/memory_graph", dry_run: bool = False, threshold: float = 0.85): + self.data_dir = data_dir + self.dry_run = dry_run + self.threshold = threshold + self.manager = None + + # 统计信息 + self.stats = { + "total_memories": 0, + "similar_pairs": 0, + "duplicates_found": 0, + "duplicates_removed": 0, + "errors": 0, + } + + async def initialize(self): + """初始化记忆管理器""" + logger.info(f"正在初始化记忆管理器 (data_dir={self.data_dir})...") + self.manager = await initialize_memory_manager(data_dir=self.data_dir) + if not self.manager: + raise RuntimeError("记忆管理器初始化失败") + + self.stats["total_memories"] = len(self.manager.graph_store.get_all_memories()) + logger.info(f"✅ 记忆管理器初始化成功,共 {self.stats['total_memories']} 条记忆") + + async def find_similar_pairs(self) -> List[Tuple[str, str, float]]: + """ + 查找所有相似的记忆对(通过向量相似度计算) + + Returns: + [(memory_id_1, memory_id_2, similarity), ...] 
+ """ + logger.info("正在扫描相似记忆对...") + similar_pairs = [] + seen_pairs = set() # 避免重复 + + # 获取所有记忆 + all_memories = self.manager.graph_store.get_all_memories() + total_memories = len(all_memories) + + logger.info(f"开始计算 {total_memories} 条记忆的相似度...") + + # 两两比较记忆的相似度 + for i, memory_i in enumerate(all_memories): + # 每处理10条记忆让出控制权 + if i % 10 == 0: + await asyncio.sleep(0) + if i > 0: + logger.info(f"进度: {i}/{total_memories} ({i*100//total_memories}%)") + + # 获取记忆i的向量(从主题节点) + vector_i = None + for node in memory_i.nodes: + if node.embedding is not None: + vector_i = node.embedding + break + + if vector_i is None: + continue + + # 与后续记忆比较 + for j in range(i + 1, total_memories): + memory_j = all_memories[j] + + # 获取记忆j的向量 + vector_j = None + for node in memory_j.nodes: + if node.embedding is not None: + vector_j = node.embedding + break + + if vector_j is None: + continue + + # 计算余弦相似度 + similarity = self._cosine_similarity(vector_i, vector_j) + + # 只保存满足阈值的相似对 + if similarity >= self.threshold: + pair_key = tuple(sorted([memory_i.id, memory_j.id])) + if pair_key not in seen_pairs: + seen_pairs.add(pair_key) + similar_pairs.append((memory_i.id, memory_j.id, similarity)) + + self.stats["similar_pairs"] = len(similar_pairs) + logger.info(f"找到 {len(similar_pairs)} 对相似记忆(阈值>={self.threshold})") + + return similar_pairs + + def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float: + """计算余弦相似度""" + try: + vec1_norm = np.linalg.norm(vec1) + vec2_norm = np.linalg.norm(vec2) + + if vec1_norm == 0 or vec2_norm == 0: + return 0.0 + + similarity = np.dot(vec1, vec2) / (vec1_norm * vec2_norm) + return float(similarity) + except Exception as e: + logger.error(f"计算余弦相似度失败: {e}") + return 0.0 + + def decide_which_to_keep(self, mem_id_1: str, mem_id_2: str) -> Tuple[Optional[str], Optional[str]]: + """ + 决定保留哪个记忆,删除哪个 + + 优先级: + 1. 重要性更高的 + 2. 激活度更高的 + 3. 
创建时间更早的 + + Returns: + (keep_id, remove_id) + """ + mem1 = self.manager.graph_store.get_memory_by_id(mem_id_1) + mem2 = self.manager.graph_store.get_memory_by_id(mem_id_2) + + if not mem1 or not mem2: + logger.warning(f"记忆不存在: {mem_id_1} or {mem_id_2}") + return None, None + + # 比较重要性 + if mem1.importance > mem2.importance: + return mem_id_1, mem_id_2 + elif mem1.importance < mem2.importance: + return mem_id_2, mem_id_1 + + # 重要性相同,比较激活度 + if mem1.activation > mem2.activation: + return mem_id_1, mem_id_2 + elif mem1.activation < mem2.activation: + return mem_id_2, mem_id_1 + + # 激活度也相同,保留更早创建的 + if mem1.created_at < mem2.created_at: + return mem_id_1, mem_id_2 + else: + return mem_id_2, mem_id_1 + + async def deduplicate_pair(self, mem_id_1: str, mem_id_2: str, similarity: float) -> bool: + """ + 去重一对相似记忆 + + Returns: + 是否成功去重 + """ + keep_id, remove_id = self.decide_which_to_keep(mem_id_1, mem_id_2) + + if not keep_id or not remove_id: + self.stats["errors"] += 1 + return False + + keep_mem = self.manager.graph_store.get_memory_by_id(keep_id) + remove_mem = self.manager.graph_store.get_memory_by_id(remove_id) + + logger.info(f"") + logger.info(f"{'[预览]' if self.dry_run else '[执行]'} 去重相似记忆对 (相似度={similarity:.3f}):") + logger.info(f" 保留: {keep_id}") + logger.info(f" - 主题: {keep_mem.metadata.get('topic', 'N/A')}") + logger.info(f" - 重要性: {keep_mem.importance:.2f}") + logger.info(f" - 激活度: {keep_mem.activation:.2f}") + logger.info(f" - 创建时间: {keep_mem.created_at}") + logger.info(f" 删除: {remove_id}") + logger.info(f" - 主题: {remove_mem.metadata.get('topic', 'N/A')}") + logger.info(f" - 重要性: {remove_mem.importance:.2f}") + logger.info(f" - 激活度: {remove_mem.activation:.2f}") + logger.info(f" - 创建时间: {remove_mem.created_at}") + + if self.dry_run: + logger.info(" [预览模式] 不执行实际删除") + self.stats["duplicates_found"] += 1 + return True + + try: + # 增强保留记忆的属性 + keep_mem.importance = min(1.0, keep_mem.importance + 0.05) + keep_mem.activation = min(1.0, keep_mem.activation + 0.05) + + # 累加访问次数 + if hasattr(keep_mem, 'access_count') and hasattr(remove_mem, 'access_count'): + keep_mem.access_count += remove_mem.access_count + + # 删除相似记忆 + await self.manager.delete_memory(remove_id) + + self.stats["duplicates_removed"] += 1 + logger.info(f" ✅ 删除成功") + + # 让出控制权 + await asyncio.sleep(0) + + return True + + except Exception as e: + logger.error(f" ❌ 删除失败: {e}", exc_info=True) + self.stats["errors"] += 1 + return False + + async def run(self): + """执行去重""" + start_time = datetime.now() + + print("="*70) + print("记忆去重工具") + print("="*70) + print(f"数据目录: {self.data_dir}") + print(f"相似度阈值: {self.threshold}") + print(f"模式: {'预览模式(不实际删除)' if self.dry_run else '执行模式(会实际删除)'}") + print("="*70) + print() + + # 初始化 + await self.initialize() + + # 查找相似对 + similar_pairs = await self.find_similar_pairs() + + if not similar_pairs: + logger.info("未找到需要去重的相似记忆对") + print() + print("="*70) + print("未找到需要去重的记忆") + print("="*70) + return + + # 去重处理 + logger.info(f"开始{'预览' if self.dry_run else '执行'}去重...") + print() + + processed_pairs = set() # 避免重复处理 + + for mem_id_1, mem_id_2, similarity in similar_pairs: + # 检查是否已处理(可能一个记忆已被删除) + pair_key = tuple(sorted([mem_id_1, mem_id_2])) + if pair_key in processed_pairs: + continue + + # 检查记忆是否仍存在 + if not self.manager.graph_store.get_memory_by_id(mem_id_1): + logger.debug(f"记忆 {mem_id_1} 已不存在,跳过") + continue + if not self.manager.graph_store.get_memory_by_id(mem_id_2): + logger.debug(f"记忆 {mem_id_2} 已不存在,跳过") + continue + + # 执行去重 + success = await self.deduplicate_pair(mem_id_1, 
mem_id_2, similarity) + + if success: + processed_pairs.add(pair_key) + + # 保存数据(如果不是干运行) + if not self.dry_run: + logger.info("正在保存数据...") + await self.manager.persistence.save_graph_store(self.manager.graph_store) + logger.info("✅ 数据已保存") + + # 统计报告 + elapsed = (datetime.now() - start_time).total_seconds() + + print() + print("="*70) + print("去重报告") + print("="*70) + print(f"总记忆数: {self.stats['total_memories']}") + print(f"相似记忆对: {self.stats['similar_pairs']}") + print(f"发现重复: {self.stats['duplicates_found'] if self.dry_run else self.stats['duplicates_removed']}") + print(f"{'预览通过' if self.dry_run else '成功删除'}: {self.stats['duplicates_found'] if self.dry_run else self.stats['duplicates_removed']}") + print(f"错误数: {self.stats['errors']}") + print(f"耗时: {elapsed:.2f}秒") + + if self.dry_run: + print() + print("⚠️ 这是预览模式,未实际删除任何记忆") + print("💡 要执行实际删除,请运行: python scripts/deduplicate_memories.py") + else: + print() + print("✅ 去重完成!") + final_count = len(self.manager.graph_store.get_all_memories()) + print(f"📊 最终记忆数: {final_count} (减少 {self.stats['total_memories'] - final_count} 条)") + + print("="*70) + + async def cleanup(self): + """清理资源""" + if self.manager: + await shutdown_memory_manager() + + +async def main(): + """主函数""" + parser = argparse.ArgumentParser( + description="记忆去重工具 - 对标记为相似的记忆进行一键去重", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +示例: + # 预览模式(推荐先运行) + python scripts/deduplicate_memories.py --dry-run + + # 执行去重 + python scripts/deduplicate_memories.py + + # 指定相似度阈值(只处理相似度>=0.9的记忆对) + python scripts/deduplicate_memories.py --threshold 0.9 + + # 指定数据目录 + python scripts/deduplicate_memories.py --data-dir data/memory_graph + + # 组合使用 + python scripts/deduplicate_memories.py --dry-run --threshold 0.95 --data-dir data/test + """ + ) + + parser.add_argument( + "--dry-run", + action="store_true", + help="预览模式,不实际删除记忆(推荐先运行此模式)" + ) + + parser.add_argument( + "--threshold", + type=float, + default=0.85, + help="相似度阈值,只处理相似度>=此值的记忆对(默认: 0.85)" + ) + + parser.add_argument( + "--data-dir", + type=str, + default="data/memory_graph", + help="记忆数据目录(默认: data/memory_graph)" + ) + + args = parser.parse_args() + + # 创建去重器 + deduplicator = MemoryDeduplicator( + data_dir=args.data_dir, + dry_run=args.dry_run, + threshold=args.threshold + ) + + try: + # 执行去重 + await deduplicator.run() + except KeyboardInterrupt: + print("\n\n⚠️ 用户中断操作") + except Exception as e: + logger.error(f"执行失败: {e}", exc_info=True) + print(f"\n❌ 执行失败: {e}") + return 1 + finally: + # 清理资源 + await deduplicator.cleanup() + + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) From a2ce020099df387f53a8955018fafcf6cfd4c7b8 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 6 Nov 2025 21:53:55 +0800 Subject: [PATCH 3/4] =?UTF-8?q?feat(memory-graph):=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E8=AE=B0=E5=BF=86=E5=9B=BE=E7=B3=BB=E7=BB=9F=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=EF=BC=8C=E6=95=B4=E5=90=88=E8=8A=82=E7=82=B9=E5=8E=BB=E9=87=8D?= =?UTF-8?q?=E5=90=88=E5=B9=B6=E5=99=A8=E4=B8=8E=E8=AE=B0=E5=BF=86=E7=AE=A1?= =?UTF-8?q?=E7=90=86=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/config/official_configs.py | 53 +++++ src/memory_graph/config.py | 272 ------------------------- src/memory_graph/core/node_merger.py | 19 +- src/memory_graph/manager.py | 13 +- src/memory_graph/tools/memory_tools.py | 2 + 5 files changed, 74 insertions(+), 285 deletions(-) delete mode 100644 src/memory_graph/config.py diff --git 
a/src/config/official_configs.py b/src/config/official_configs.py index 71273d40b..a8aa295ed 100644 --- a/src/config/official_configs.py +++ b/src/config/official_configs.py @@ -401,6 +401,59 @@ class MemoryConfig(ValidatedConfigBase): memory_system_load_balancing: bool = Field(default=True, description="启用记忆系统负载均衡") memory_build_throttling: bool = Field(default=True, description="启用记忆构建节流") memory_priority_queue_enabled: bool = Field(default=True, description="启用记忆优先级队列") + + # === 记忆图系统配置 (Memory Graph System) === + # 新一代记忆系统的配置项 + enable: bool = Field(default=True, description="启用记忆图系统") + data_dir: str = Field(default="data/memory_graph", description="记忆数据存储目录") + + # 向量存储配置 + vector_collection_name: str = Field(default="memory_nodes", description="向量集合名称") + vector_db_path: str = Field(default="data/memory_graph/chroma_db", description="向量数据库路径") + + # 检索配置 + search_top_k: int = Field(default=10, description="默认检索返回数量") + search_min_importance: float = Field(default=0.3, description="最小重要性阈值") + search_similarity_threshold: float = Field(default=0.5, description="向量相似度阈值") + search_max_expand_depth: int = Field(default=2, description="检索时图扩展深度(0-3)") + enable_query_optimization: bool = Field(default=True, description="启用查询优化") + + # 检索权重配置 (记忆图系统) + search_vector_weight: float = Field(default=0.4, description="向量相似度权重") + search_graph_distance_weight: float = Field(default=0.2, description="图距离权重") + search_importance_weight: float = Field(default=0.2, description="重要性权重") + search_recency_weight: float = Field(default=0.2, description="时效性权重") + + # 记忆整合配置 + consolidation_enabled: bool = Field(default=False, description="是否启用记忆整合") + consolidation_interval_hours: float = Field(default=6.0, description="整合任务执行间隔(小时)") + consolidation_similarity_threshold: float = Field(default=0.92, description="相似记忆去重阈值") + consolidation_time_window_hours: float = Field(default=6.0, description="整合时间窗口(小时)") + consolidation_max_batch_size: int = Field(default=50, description="单次最多处理的记忆数量") + + # 自动关联配置 + auto_link_enabled: bool = Field(default=True, description="是否启用自动关联") + auto_link_max_candidates: int = Field(default=5, description="每个记忆最多关联候选数") + auto_link_min_confidence: float = Field(default=0.7, description="最低置信度阈值") + + # 遗忘配置 (记忆图系统) + forgetting_enabled: bool = Field(default=True, description="是否启用自动遗忘") + forgetting_activation_threshold: float = Field(default=0.1, description="激活度阈值") + forgetting_min_importance: float = Field(default=0.8, description="最小保护重要性") + + # 激活配置 + activation_decay_rate: float = Field(default=0.9, description="激活度衰减率") + activation_propagation_strength: float = Field(default=0.5, description="激活传播强度") + activation_propagation_depth: int = Field(default=2, description="激活传播深度") + + # 性能配置 + max_memory_nodes_per_memory: int = Field(default=10, description="每个记忆最多包含的节点数") + max_related_memories: int = Field(default=5, description="相关记忆最大数量") + + # 节点去重合并配置 + node_merger_similarity_threshold: float = Field(default=0.85, description="节点去重相似度阈值") + node_merger_context_match_required: bool = Field(default=True, description="节点合并是否要求上下文匹配") + node_merger_merge_batch_size: int = Field(default=50, description="节点合并批量处理大小") class MoodConfig(ValidatedConfigBase): diff --git a/src/memory_graph/config.py b/src/memory_graph/config.py deleted file mode 100644 index 4aa8c94da..000000000 --- a/src/memory_graph/config.py +++ /dev/null @@ -1,272 +0,0 @@ -""" -记忆图系统配置管理 -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from pathlib import Path 
-from typing import Dict, Optional - - -@dataclass -class ConsolidationConfig: - """记忆整理配置""" - - interval_hours: int = 6 # 整理间隔(小时) - batch_size: int = 100 # 每次处理记忆数量 - enable_auto_discovery: bool = True # 是否启用自动关联发现 - enable_conflict_detection: bool = True # 是否启用冲突检测 - - -@dataclass -class RetrievalConfig: - """记忆检索配置""" - - default_mode: str = "auto" # auto/fast/deep - max_expand_depth: int = 2 # 最大图扩展深度 - vector_weight: float = 0.4 # 向量相似度权重 - graph_distance_weight: float = 0.2 # 图距离权重 - importance_weight: float = 0.2 # 重要性权重 - recency_weight: float = 0.2 # 时效性权重 - - def __post_init__(self): - """验证权重总和""" - total = self.vector_weight + self.graph_distance_weight + self.importance_weight + self.recency_weight - if abs(total - 1.0) > 0.01: - raise ValueError(f"权重总和必须为1.0,当前为 {total}") - - -@dataclass -class NodeMergerConfig: - """节点去重配置""" - - similarity_threshold: float = 0.85 # 相似度阈值 - context_match_required: bool = True # 是否要求上下文匹配 - merge_batch_size: int = 50 # 批量处理大小 - - def __post_init__(self): - """验证阈值范围""" - if not 0.0 <= self.similarity_threshold <= 1.0: - raise ValueError(f"相似度阈值必须在 [0, 1] 范围内,当前为 {self.similarity_threshold}") - - -@dataclass -class StorageConfig: - """存储配置""" - - data_dir: Path = field(default_factory=lambda: Path("data/memory_graph")) - vector_collection_name: str = "memory_nodes" - graph_file_name: str = "memory_graph.json" - enable_persistence: bool = True # 是否启用持久化 - auto_save_interval: int = 300 # 自动保存间隔(秒) - - -@dataclass -class MemoryGraphConfig: - """记忆图系统总配置""" - - # 基础配置 - enable: bool = True # 是否启用记忆图系统 - data_dir: Path = field(default_factory=lambda: Path("data/memory_graph")) - - # 向量存储配置 - vector_collection_name: str = "memory_nodes" - vector_db_path: Path = field(default_factory=lambda: Path("data/memory_graph/chroma_db")) - - # 检索配置 - search_top_k: int = 10 - search_min_importance: float = 0.3 - search_similarity_threshold: float = 0.5 - enable_query_optimization: bool = True - - # 整合配置 - consolidation_enabled: bool = True - consolidation_interval_hours: float = 1.0 - consolidation_similarity_threshold: float = 0.85 - consolidation_time_window_hours: int = 24 - - # 自动关联配置 - auto_link_enabled: bool = True # 是否启用自动关联 - auto_link_max_candidates: int = 5 # 每个记忆最多关联候选数 - auto_link_min_confidence: float = 0.7 # 最低置信度阈值 - - # 遗忘配置 - forgetting_enabled: bool = True - forgetting_activation_threshold: float = 0.1 - forgetting_min_importance: float = 0.8 - - # 激活配置 - activation_decay_rate: float = 0.9 - activation_propagation_strength: float = 0.5 - activation_propagation_depth: int = 1 - - # 性能配置 - max_memory_nodes_per_memory: int = 10 - max_related_memories: int = 5 - - # 旧配置(向后兼容) - consolidation: ConsolidationConfig = field(default_factory=ConsolidationConfig) - retrieval: RetrievalConfig = field(default_factory=RetrievalConfig) - node_merger: NodeMergerConfig = field(default_factory=NodeMergerConfig) - storage: StorageConfig = field(default_factory=StorageConfig) - - # 时间衰减配置 - decay_rates: Dict[str, float] = field( - default_factory=lambda: { - "EVENT": 0.05, # 事件衰减较快 - "FACT": 0.01, # 事实衰减慢 - "RELATION": 0.005, # 关系衰减很慢 - "OPINION": 0.03, # 观点中等衰减 - } - ) - - # 嵌入模型配置 - embedding_model: Optional[str] = None # 如果为None,则使用系统默认 - embedding_dimension: int = 384 # 默认使用 sentence-transformers 的维度 - - # 调试和日志 - enable_debug_logging: bool = False - enable_visualization: bool = False # 是否启用记忆可视化 - - @classmethod - def from_bot_config(cls, bot_config) -> MemoryGraphConfig: - """从bot_config加载配置""" - try: - # 尝试获取配置(优先使用memory,兼容memory_graph) - if 
hasattr(bot_config, 'memory') and bot_config.memory is not None: - mg_config = bot_config.memory - elif hasattr(bot_config, 'memory_graph'): - mg_config = bot_config.memory_graph - - config = cls( - enable=getattr(mg_config, 'enable', True), - data_dir=Path(getattr(mg_config, 'data_dir', 'data/memory_graph')), - vector_collection_name=getattr(mg_config, 'vector_collection_name', 'memory_nodes'), - vector_db_path=Path(getattr(mg_config, 'vector_db_path', 'data/memory_graph/chroma_db')), - search_top_k=getattr(mg_config, 'search_top_k', 10), - search_min_importance=getattr(mg_config, 'search_min_importance', 0.3), - search_similarity_threshold=getattr(mg_config, 'search_similarity_threshold', 0.5), - enable_query_optimization=getattr(mg_config, 'enable_query_optimization', True), - consolidation_enabled=getattr(mg_config, 'consolidation_enabled', True), - consolidation_interval_hours=getattr(mg_config, 'consolidation_interval_hours', 1.0), - consolidation_similarity_threshold=getattr(mg_config, 'consolidation_similarity_threshold', 0.85), - consolidation_time_window_hours=getattr(mg_config, 'consolidation_time_window_hours', 24), - auto_link_enabled=getattr(mg_config, 'auto_link_enabled', True), - auto_link_max_candidates=getattr(mg_config, 'auto_link_max_candidates', 5), - auto_link_min_confidence=getattr(mg_config, 'auto_link_min_confidence', 0.7), - forgetting_enabled=getattr(mg_config, 'forgetting_enabled', True), - forgetting_activation_threshold=getattr(mg_config, 'forgetting_activation_threshold', 0.1), - forgetting_min_importance=getattr(mg_config, 'forgetting_min_importance', 0.8), - activation_decay_rate=getattr(mg_config, 'activation_decay_rate', 0.9), - activation_propagation_strength=getattr(mg_config, 'activation_propagation_strength', 0.5), - activation_propagation_depth=getattr(mg_config, 'activation_propagation_depth', 1), - max_memory_nodes_per_memory=getattr(mg_config, 'max_memory_nodes_per_memory', 10), - max_related_memories=getattr(mg_config, 'max_related_memories', 5), - # 检索配置 - retrieval=RetrievalConfig( - max_expand_depth=getattr(mg_config, 'search_max_expand_depth', 2), - vector_weight=getattr(mg_config, 'search_vector_weight', 0.4), - graph_distance_weight=getattr(mg_config, 'search_graph_distance_weight', 0.2), - importance_weight=getattr(mg_config, 'search_importance_weight', 0.2), - recency_weight=getattr(mg_config, 'search_recency_weight', 0.2), - ), - ) - - return config - else: - # 没有找到memory_graph配置,使用默认值 - return cls() - - except Exception as e: - import logging - logger = logging.getLogger(__name__) - logger.warning(f"从bot_config加载memory_graph配置失败,使用默认配置: {e}") - return cls() - - @classmethod - def from_dict(cls, config_dict: Dict) -> MemoryGraphConfig: - """从字典创建配置""" - return cls( - # 新配置字段 - enable=config_dict.get("enable", True), - data_dir=Path(config_dict.get("data_dir", "data/memory_graph")), - vector_collection_name=config_dict.get("vector_collection_name", "memory_nodes"), - vector_db_path=Path(config_dict.get("vector_db_path", "data/memory_graph/chroma_db")), - search_top_k=config_dict.get("search_top_k", 10), - search_min_importance=config_dict.get("search_min_importance", 0.3), - search_similarity_threshold=config_dict.get("search_similarity_threshold", 0.5), - enable_query_optimization=config_dict.get("enable_query_optimization", True), - consolidation_enabled=config_dict.get("consolidation_enabled", True), - consolidation_interval_hours=config_dict.get("consolidation_interval_hours", 1.0), - 
consolidation_similarity_threshold=config_dict.get("consolidation_similarity_threshold", 0.85), - consolidation_time_window_hours=config_dict.get("consolidation_time_window_hours", 24), - auto_link_enabled=config_dict.get("auto_link_enabled", True), - auto_link_max_candidates=config_dict.get("auto_link_max_candidates", 5), - auto_link_min_confidence=config_dict.get("auto_link_min_confidence", 0.7), - forgetting_enabled=config_dict.get("forgetting_enabled", True), - forgetting_activation_threshold=config_dict.get("forgetting_activation_threshold", 0.1), - forgetting_min_importance=config_dict.get("forgetting_min_importance", 0.8), - activation_decay_rate=config_dict.get("activation_decay_rate", 0.9), - activation_propagation_strength=config_dict.get("activation_propagation_strength", 0.5), - activation_propagation_depth=config_dict.get("activation_propagation_depth", 1), - max_memory_nodes_per_memory=config_dict.get("max_memory_nodes_per_memory", 10), - max_related_memories=config_dict.get("max_related_memories", 5), - # 旧配置字段(向后兼容) - consolidation=ConsolidationConfig(**config_dict.get("consolidation", {})), - retrieval=RetrievalConfig( - max_expand_depth=config_dict.get("search_max_expand_depth", 2), - vector_weight=config_dict.get("search_vector_weight", 0.4), - graph_distance_weight=config_dict.get("search_graph_distance_weight", 0.2), - importance_weight=config_dict.get("search_importance_weight", 0.2), - recency_weight=config_dict.get("search_recency_weight", 0.2), - **config_dict.get("retrieval", {}) - ), - node_merger=NodeMergerConfig(**config_dict.get("node_merger", {})), - storage=StorageConfig(**config_dict.get("storage", {})), - decay_rates=config_dict.get("decay_rates", cls().decay_rates), - embedding_model=config_dict.get("embedding_model"), - embedding_dimension=config_dict.get("embedding_dimension", 384), - enable_debug_logging=config_dict.get("enable_debug_logging", False), - enable_visualization=config_dict.get("enable_visualization", False), - ) - - def to_dict(self) -> Dict: - """转换为字典""" - return { - "consolidation": { - "interval_hours": self.consolidation.interval_hours, - "batch_size": self.consolidation.batch_size, - "enable_auto_discovery": self.consolidation.enable_auto_discovery, - "enable_conflict_detection": self.consolidation.enable_conflict_detection, - }, - "retrieval": { - "default_mode": self.retrieval.default_mode, - "max_expand_depth": self.retrieval.max_expand_depth, - "vector_weight": self.retrieval.vector_weight, - "graph_distance_weight": self.retrieval.graph_distance_weight, - "importance_weight": self.retrieval.importance_weight, - "recency_weight": self.retrieval.recency_weight, - }, - "node_merger": { - "similarity_threshold": self.node_merger.similarity_threshold, - "context_match_required": self.node_merger.context_match_required, - "merge_batch_size": self.node_merger.merge_batch_size, - }, - "storage": { - "data_dir": str(self.storage.data_dir), - "vector_collection_name": self.storage.vector_collection_name, - "graph_file_name": self.storage.graph_file_name, - "enable_persistence": self.storage.enable_persistence, - "auto_save_interval": self.storage.auto_save_interval, - }, - "decay_rates": self.decay_rates, - "embedding_model": self.embedding_model, - "embedding_dimension": self.embedding_dimension, - "enable_debug_logging": self.enable_debug_logging, - "enable_visualization": self.enable_visualization, - } - - -# 默认配置实例 -DEFAULT_CONFIG = MemoryGraphConfig() diff --git a/src/memory_graph/core/node_merger.py 
b/src/memory_graph/core/node_merger.py index 378aa5f83..e8b790f1e 100644 --- a/src/memory_graph/core/node_merger.py +++ b/src/memory_graph/core/node_merger.py @@ -4,12 +4,13 @@ from __future__ import annotations +from dataclasses import dataclass from typing import List, Optional, Tuple import numpy as np from src.common.logger import get_logger -from src.memory_graph.config import NodeMergerConfig +from src.config.official_configs import MemoryConfig from src.memory_graph.models import MemoryNode, NodeType from src.memory_graph.storage.graph_store import GraphStore from src.memory_graph.storage.vector_store import VectorStore @@ -31,7 +32,7 @@ class NodeMerger: self, vector_store: VectorStore, graph_store: GraphStore, - config: Optional[NodeMergerConfig] = None, + config: MemoryConfig, ): """ 初始化节点合并器 @@ -39,15 +40,15 @@ class NodeMerger: Args: vector_store: 向量存储 graph_store: 图存储 - config: 配置对象 + config: 记忆配置对象 """ self.vector_store = vector_store self.graph_store = graph_store - self.config = config or NodeMergerConfig() + self.config = config logger.info( - f"初始化节点合并器: threshold={self.config.similarity_threshold}, " - f"context_match={self.config.context_match_required}" + f"初始化节点合并器: threshold={self.config.node_merger_similarity_threshold}, " + f"context_match={self.config.node_merger_context_match_required}" ) async def find_similar_nodes( @@ -71,7 +72,7 @@ class NodeMerger: logger.warning(f"节点 {node.id} 没有 embedding,无法查找相似节点") return [] - threshold = threshold or self.config.similarity_threshold + threshold = threshold or self.config.node_merger_similarity_threshold try: # 在向量存储中搜索相似节点 @@ -121,7 +122,7 @@ class NodeMerger: 是否应该合并 """ # 1. 检查相似度阈值 - if similarity < self.config.similarity_threshold: + if similarity < self.config.node_merger_similarity_threshold: return False # 2. 非常高的相似度(>0.95)直接合并 @@ -130,7 +131,7 @@ class NodeMerger: return True # 3. 如果不要求上下文匹配,则通过相似度判断 - if not self.config.context_match_required: + if not self.config.node_merger_context_match_required: return True # 4. 
检查上下文匹配 diff --git a/src/memory_graph/manager.py b/src/memory_graph/manager.py index 9c6ab4e94..5def3c2c9 100644 --- a/src/memory_graph/manager.py +++ b/src/memory_graph/manager.py @@ -15,6 +15,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Set, Tuple from src.config.config import global_config +from src.config.official_configs import MemoryConfig from src.memory_graph.core.builder import MemoryBuilder from src.memory_graph.core.extractor import MemoryExtractor from src.memory_graph.models import Memory, MemoryEdge, MemoryNode, MemoryType, NodeType, EdgeType @@ -53,7 +54,7 @@ class MemoryManager: if not global_config.memory or not getattr(global_config.memory, 'enable', False): raise ValueError("记忆系统未启用,请在配置文件中启用 [memory] enable = true") - self.config = global_config.memory + self.config: MemoryConfig = global_config.memory self.data_dir = data_dir or Path(getattr(self.config, 'data_dir', 'data/memory_graph')) # 存储组件 @@ -132,12 +133,16 @@ class MemoryManager: embedding_generator=self.embedding_generator, ) + # 检查配置值 + expand_depth = self.config.search_max_expand_depth + logger.info(f"📊 配置检查: search_max_expand_depth={expand_depth}") + self.tools = MemoryTools( vector_store=self.vector_store, graph_store=self.graph_store, persistence_manager=self.persistence, embedding_generator=self.embedding_generator, - max_expand_depth=getattr(self.config, 'search_max_expand_depth', 1), # 从配置读取默认深度 + max_expand_depth=expand_depth, # 从配置读取图扩展深度 ) self._initialized = True @@ -433,7 +438,7 @@ class MemoryManager: min_importance: float = 0.0, include_forgotten: bool = False, use_multi_query: bool = True, - expand_depth: int = 1, + expand_depth: int | None = None, context: Optional[Dict[str, Any]] = None, ) -> List[Memory]: """ @@ -468,7 +473,7 @@ class MemoryManager: "query": query, "top_k": top_k, "use_multi_query": use_multi_query, - "expand_depth": expand_depth, # 传递图扩展深度 + "expand_depth": expand_depth or global_config.memory.search_max_expand_depth, # 传递图扩展深度 "context": context, } diff --git a/src/memory_graph/tools/memory_tools.py b/src/memory_graph/tools/memory_tools.py index e8b600540..798a36268 100644 --- a/src/memory_graph/tools/memory_tools.py +++ b/src/memory_graph/tools/memory_tools.py @@ -51,6 +51,8 @@ class MemoryTools: self.persistence_manager = persistence_manager self._initialized = False self.max_expand_depth = max_expand_depth # 保存配置的默认值 + + logger.info(f"MemoryTools 初始化: max_expand_depth={max_expand_depth}") # 初始化组件 self.extractor = MemoryExtractor() From 023fab73a564cd5ad59aa5056610161b05758d43 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Thu, 6 Nov 2025 23:56:18 +0800 Subject: [PATCH 4/4] =?UTF-8?q?feat(memory):=20=E6=9B=B4=E6=96=B0=E8=AE=B0?= =?UTF-8?q?=E5=BF=86=E7=AE=A1=E7=90=86=E5=92=8C=E9=85=8D=E7=BD=AE=EF=BC=8C?= =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=95=B4=E5=90=88=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=AF=AD=E4=B9=89=E7=9B=B8=E4=BC=BC=E5=BA=A6?= =?UTF-8?q?=E9=98=88=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/message_receive/storage.py | 27 +- src/config/official_configs.py | 25 +- src/memory_graph/manager.py | 627 ++++++++++++++++++++----- src/memory_graph/tools/memory_tools.py | 11 +- template/bot_config_template.toml | 25 +- 5 files changed, 579 insertions(+), 136 deletions(-) diff --git a/src/chat/message_receive/storage.py b/src/chat/message_receive/storage.py index 73fccdf97..2aa5562c1 100644 --- a/src/chat/message_receive/storage.py 
+++ b/src/chat/message_receive/storage.py @@ -182,12 +182,14 @@ class MessageStorageBatcher: is_command = message.is_command or False is_public_notice = message.is_public_notice or False notice_type = message.notice_type - actions = message.actions + # 序列化actions列表为JSON字符串 + actions = orjson.dumps(message.actions).decode("utf-8") if message.actions else None should_reply = message.should_reply should_act = message.should_act additional_config = message.additional_config - key_words = "" - key_words_lite = "" + # 确保关键词字段是字符串格式(如果不是,则序列化) + key_words = MessageStorage._serialize_keywords(message.key_words) + key_words_lite = MessageStorage._serialize_keywords(message.key_words_lite) memorized_times = 0 user_platform = message.user_info.platform if message.user_info else "" @@ -254,7 +256,8 @@ class MessageStorageBatcher: is_command = message.is_command is_public_notice = getattr(message, "is_public_notice", False) notice_type = getattr(message, "notice_type", None) - actions = getattr(message, "actions", None) + # 序列化actions列表为JSON字符串 + actions = orjson.dumps(getattr(message, "actions", None)).decode("utf-8") if getattr(message, "actions", None) else None should_reply = getattr(message, "should_reply", None) should_act = getattr(message, "should_act", None) additional_config = getattr(message, "additional_config", None) @@ -580,6 +583,11 @@ class MessageStorage: is_picid = False is_notify = False is_command = False + is_public_notice = False + notice_type = None + actions = None + should_reply = False + should_act = False key_words = "" key_words_lite = "" else: @@ -593,6 +601,12 @@ class MessageStorage: is_picid = message.is_picid is_notify = message.is_notify is_command = message.is_command + is_public_notice = getattr(message, "is_public_notice", False) + notice_type = getattr(message, "notice_type", None) + # 序列化actions列表为JSON字符串 + actions = orjson.dumps(getattr(message, "actions", None)).decode("utf-8") if getattr(message, "actions", None) else None + should_reply = getattr(message, "should_reply", False) + should_act = getattr(message, "should_act", False) # 序列化关键词列表为JSON字符串 key_words = MessageStorage._serialize_keywords(message.key_words) key_words_lite = MessageStorage._serialize_keywords(message.key_words_lite) @@ -666,6 +680,11 @@ class MessageStorage: is_picid=is_picid, is_notify=is_notify, is_command=is_command, + is_public_notice=is_public_notice, + notice_type=notice_type, + actions=actions, + should_reply=should_reply, + should_act=should_act, key_words=key_words, key_words_lite=key_words_lite, ) diff --git a/src/config/official_configs.py b/src/config/official_configs.py index a8aa295ed..8be21a3eb 100644 --- a/src/config/official_configs.py +++ b/src/config/official_configs.py @@ -416,6 +416,7 @@ class MemoryConfig(ValidatedConfigBase): search_min_importance: float = Field(default=0.3, description="最小重要性阈值") search_similarity_threshold: float = Field(default=0.5, description="向量相似度阈值") search_max_expand_depth: int = Field(default=2, description="检索时图扩展深度(0-3)") + search_expand_semantic_threshold: float = Field(default=0.3, description="图扩展时语义相似度阈值(建议0.3-0.5,过低可能引入无关记忆,过高无法扩展)") enable_query_optimization: bool = Field(default=True, description="启用查询优化") # 检索权重配置 (记忆图系统) @@ -426,15 +427,21 @@ class MemoryConfig(ValidatedConfigBase): # 记忆整合配置 consolidation_enabled: bool = Field(default=False, description="是否启用记忆整合") - consolidation_interval_hours: float = Field(default=6.0, description="整合任务执行间隔(小时)") - consolidation_similarity_threshold: float = Field(default=0.92, 
description="相似记忆去重阈值") - consolidation_time_window_hours: float = Field(default=6.0, description="整合时间窗口(小时)") - consolidation_max_batch_size: int = Field(default=50, description="单次最多处理的记忆数量") - - # 自动关联配置 - auto_link_enabled: bool = Field(default=True, description="是否启用自动关联") - auto_link_max_candidates: int = Field(default=5, description="每个记忆最多关联候选数") - auto_link_min_confidence: float = Field(default=0.7, description="最低置信度阈值") + consolidation_interval_hours: float = Field(default=2.0, description="整合任务执行间隔(小时)") + consolidation_deduplication_threshold: float = Field(default=0.93, description="相似记忆去重阈值") + consolidation_time_window_hours: float = Field(default=2.0, description="整合时间窗口(小时)- 统一用于去重和关联") + consolidation_max_batch_size: int = Field(default=30, description="单次最多处理的记忆数量") + + # 记忆关联配置(整合功能的子模块) + consolidation_linking_enabled: bool = Field(default=True, description="是否启用记忆关联建立") + consolidation_linking_max_candidates: int = Field(default=10, description="每个记忆最多关联的候选数") + consolidation_linking_max_memories: int = Field(default=20, description="单次最多处理的记忆总数") + consolidation_linking_min_importance: float = Field(default=0.5, description="最低重要性阈值") + consolidation_linking_pre_filter_threshold: float = Field(default=0.7, description="向量相似度预筛选阈值") + consolidation_linking_max_pairs_for_llm: int = Field(default=5, description="最多发送给LLM分析的候选对数") + consolidation_linking_min_confidence: float = Field(default=0.7, description="LLM分析最低置信度阈值") + consolidation_linking_llm_temperature: float = Field(default=0.2, description="LLM分析温度参数") + consolidation_linking_llm_max_tokens: int = Field(default=1500, description="LLM分析最大输出长度") # 遗忘配置 (记忆图系统) forgetting_enabled: bool = Field(default=True, description="是否启用自动遗忘") diff --git a/src/memory_graph/manager.py b/src/memory_graph/manager.py index 5def3c2c9..142fe101a 100644 --- a/src/memory_graph/manager.py +++ b/src/memory_graph/manager.py @@ -25,6 +25,10 @@ from src.memory_graph.storage.vector_store import VectorStore from src.memory_graph.tools.memory_tools import MemoryTools from src.memory_graph.utils.embeddings import EmbeddingGenerator import uuid +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import numpy as np logger = logging.getLogger(__name__) @@ -135,14 +139,16 @@ class MemoryManager: # 检查配置值 expand_depth = self.config.search_max_expand_depth - logger.info(f"📊 配置检查: search_max_expand_depth={expand_depth}") - + expand_semantic_threshold = self.config.search_expand_semantic_threshold + logger.info(f"📊 配置检查: search_max_expand_depth={expand_depth}, search_expand_semantic_threshold={expand_semantic_threshold}") + self.tools = MemoryTools( vector_store=self.vector_store, graph_store=self.graph_store, persistence_manager=self.persistence, embedding_generator=self.embedding_generator, max_expand_depth=expand_depth, # 从配置读取图扩展深度 + expand_semantic_threshold=expand_semantic_threshold, # 从配置读取图扩展语义阈值 ) self._initialized = True @@ -977,135 +983,227 @@ class MemoryManager: ) -> Dict[str, Any]: """ 整理记忆:直接合并去重相似记忆(不创建新边) - - 优化点: - 1. 添加批量限制,避免长时间阻塞 - 2. 相似记忆直接覆盖合并,不创建关联边 - 3. 使用 asyncio.sleep 让出控制权,避免阻塞事件循环 - + + 性能优化版本: + 1. 使用 asyncio.create_task 在后台执行,避免阻塞主流程 + 2. 向量计算批量处理,减少重复计算 + 3. 延迟保存,批量写入数据库 + 4. 
更频繁的协作式多任务让出 + Args: similarity_threshold: 相似度阈值(默认0.85,建议提高到0.9减少误判) time_window_hours: 时间窗口(小时) max_batch_size: 单次最多处理的记忆数量 - + Returns: - 整理结果 + 整理结果(如果是异步执行,返回启动状态) """ if not self._initialized: await self.initialize() try: - logger.info(f"开始记忆整理 (similarity_threshold={similarity_threshold}, time_window={time_window_hours}h, max_batch={max_batch_size})...") - + logger.info(f"🚀 启动记忆整理任务 (similarity_threshold={similarity_threshold}, time_window={time_window_hours}h, max_batch={max_batch_size})...") + + # 创建后台任务执行整理 + task = asyncio.create_task( + self._consolidate_memories_background( + similarity_threshold=similarity_threshold, + time_window_hours=time_window_hours, + max_batch_size=max_batch_size + ) + ) + + # 返回任务启动状态,不等待完成 + return { + "task_started": True, + "task_id": id(task), + "message": "记忆整理任务已在后台启动" + } + + except Exception as e: + logger.error(f"启动记忆整理任务失败: {e}", exc_info=True) + return {"error": str(e), "task_started": False} + + async def _consolidate_memories_background( + self, + similarity_threshold: float, + time_window_hours: float, + max_batch_size: int, + ) -> None: + """ + 后台执行记忆整理的具体实现 + + 这个方法会在独立任务中运行,不阻塞主流程 + """ + try: result = { "merged_count": 0, "checked_count": 0, "skipped_count": 0, } - + # 获取最近创建的记忆 cutoff_time = datetime.now() - timedelta(hours=time_window_hours) all_memories = self.graph_store.get_all_memories() - + recent_memories = [ mem for mem in all_memories if mem.created_at >= cutoff_time and not mem.metadata.get("forgotten", False) ] - + if not recent_memories: - logger.info("没有需要整理的记忆") - return result - + logger.info("✅ 记忆整理完成: 没有需要整理的记忆") + return + # 限制批量处理数量 if len(recent_memories) > max_batch_size: - logger.info(f"记忆数量 {len(recent_memories)} 超过批量限制 {max_batch_size},仅处理最新的 {max_batch_size} 条") + logger.info(f"📊 记忆数量 {len(recent_memories)} 超过批量限制 {max_batch_size},仅处理最新的 {max_batch_size} 条") recent_memories = sorted(recent_memories, key=lambda m: m.created_at, reverse=True)[:max_batch_size] result["skipped_count"] = len(all_memories) - max_batch_size - - logger.info(f"找到 {len(recent_memories)} 条待整理记忆") + + logger.info(f"📋 找到 {len(recent_memories)} 条待整理记忆") result["checked_count"] = len(recent_memories) - - # 按记忆类型分组 + + # 按记忆类型分组,减少跨类型比较 memories_by_type: Dict[str, List[Memory]] = {} for mem in recent_memories: mem_type = mem.metadata.get("memory_type", "") if mem_type not in memories_by_type: memories_by_type[mem_type] = [] memories_by_type[mem_type].append(mem) - - # 记录已删除的记忆ID,避免重复处理 + + # 记录需要删除的记忆,延迟批量删除 + to_delete: List[Tuple[Memory, str]] = [] # (memory, reason) deleted_ids = set() - + # 对每个类型的记忆进行相似度检测 for mem_type, memories in memories_by_type.items(): if len(memories) < 2: continue - - logger.debug(f"检查类型 '{mem_type}' 的 {len(memories)} 条记忆") - - # 使用向量相似度检测 - for i in range(len(memories)): - # 让出控制权,避免长时间阻塞 - if i % 10 == 0: - await asyncio.sleep(0) - - if memories[i].id in deleted_ids: + + logger.debug(f"🔍 检查类型 '{mem_type}' 的 {len(memories)} 条记忆") + + # 预提取所有主题节点的嵌入向量 + embeddings_map: Dict[str, "np.ndarray"] = {} + valid_memories = [] + + for mem in memories: + topic_node = next((n for n in mem.nodes if n.node_type == NodeType.TOPIC), None) + if topic_node and topic_node.embedding is not None: + embeddings_map[mem.id] = topic_node.embedding + valid_memories.append(mem) + + # 批量计算相似度矩阵(比逐个计算更高效) + import numpy as np + + for i in range(len(valid_memories)): + # 更频繁的协作式多任务让出 + if i % 5 == 0: + await asyncio.sleep(0.001) # 1ms让出 + + mem_i = valid_memories[i] + if mem_i.id in deleted_ids: continue - - for j in range(i + 1, 
len(memories)): - if memories[j].id in deleted_ids: + + for j in range(i + 1, len(valid_memories)): + if valid_memories[j].id in deleted_ids: continue - - mem_i = memories[i] - mem_j = memories[j] - - # 获取主题节点的向量 - topic_i = next((n for n in mem_i.nodes if n.node_type == NodeType.TOPIC), None) - topic_j = next((n for n in mem_j.nodes if n.node_type == NodeType.TOPIC), None) - - if not topic_i or not topic_j: - continue - - if topic_i.embedding is None or topic_j.embedding is None: - continue - - # 计算余弦相似度 - import numpy as np - similarity = np.dot(topic_i.embedding, topic_j.embedding) / ( - np.linalg.norm(topic_i.embedding) * np.linalg.norm(topic_j.embedding) - ) - + + mem_j = valid_memories[j] + + # 快速向量相似度计算 + embedding_i = embeddings_map[mem_i.id] + embedding_j = embeddings_map[mem_j.id] + + # 优化的余弦相似度计算 + similarity = self._fast_cosine_similarity(embedding_i, embedding_j) + if similarity >= similarity_threshold: - # 直接去重:保留重要性高的,删除另一个(不创建关联边) + # 决定保留哪个记忆 if mem_i.importance >= mem_j.importance: keep_mem, remove_mem = mem_i, mem_j else: keep_mem, remove_mem = mem_j, mem_i - - logger.info( - f"去重相似记忆 (similarity={similarity:.3f}): " - f"保留 {keep_mem.id}, 删除 {remove_mem.id}" + + logger.debug( + f"🔄 标记相似记忆 (similarity={similarity:.3f}): " + f"保留 {keep_mem.id[:8]}, 删除 {remove_mem.id[:8]}" ) - - # 增强保留记忆的重要性(合并信息价值) + + # 增强保留记忆的重要性 keep_mem.importance = min(1.0, keep_mem.importance + 0.05) - keep_mem.activation = min(1.0, keep_mem.activation + 0.05) - - # 将被删除记忆的访问次数累加到保留记忆 + + # 累加访问次数 if hasattr(keep_mem, 'access_count') and hasattr(remove_mem, 'access_count'): keep_mem.access_count += remove_mem.access_count - - # 直接删除相似记忆(不创建边,简化图结构) - await self.delete_memory(remove_mem.id) + + # 标记为待删除(不立即删除) + to_delete.append((remove_mem, f"与记忆 {keep_mem.id[:8]} 相似度 {similarity:.3f}")) deleted_ids.add(remove_mem.id) result["merged_count"] += 1 - - logger.info(f"记忆整理完成: {result}") - return result - + + # 每处理完一个类型就让出控制权 + await asyncio.sleep(0.005) # 5ms让出 + + # 批量删除标记的记忆 + if to_delete: + logger.info(f"🗑️ 开始批量删除 {len(to_delete)} 条相似记忆") + + for memory, reason in to_delete: + try: + # 从向量存储删除节点 + for node in memory.nodes: + if node.embedding is not None: + await self.vector_store.delete_node(node.id) + + # 从图存储删除记忆 + self.graph_store.remove_memory(memory.id) + + except Exception as e: + logger.warning(f"删除记忆 {memory.id[:8]} 失败: {e}") + + # 批量保存(一次性写入,减少I/O) + await self.persistence.save_graph_store(self.graph_store) + logger.info(f"💾 批量保存完成") + + logger.info(f"✅ 记忆整理完成: {result}") + except Exception as e: - logger.error(f"记忆整理失败: {e}", exc_info=True) - return {"error": str(e), "merged_count": 0, "checked_count": 0} + logger.error(f"❌ 记忆整理失败: {e}", exc_info=True) + + def _fast_cosine_similarity(self, vec1: "np.ndarray", vec2: "np.ndarray") -> float: + """ + 快速余弦相似度计算(优化版本) + + Args: + vec1, vec2: 向量 + + Returns: + 余弦相似度 + """ + try: + import numpy as np + + # 避免重复的类型检查和转换 + # 向量应该是numpy数组,如果不是,转换一次 + if not isinstance(vec1, np.ndarray): + vec1 = np.asarray(vec1, dtype=np.float32) + if not isinstance(vec2, np.ndarray): + vec2 = np.asarray(vec2, dtype=np.float32) + + # 使用更高效的范数计算 + norm1 = np.linalg.norm(vec1) + norm2 = np.linalg.norm(vec2) + + if norm1 == 0 or norm2 == 0: + return 0.0 + + # 直接计算点积和除法 + return float(np.dot(vec1, vec2) / (norm1 * norm2)) + + except Exception as e: + logger.warning(f"计算余弦相似度失败: {e}") + return 0.0 async def auto_link_memories( self, @@ -1478,14 +1576,14 @@ class MemoryManager: async def maintenance(self) -> Dict[str, Any]: """ - 执行维护任务 - + 执行维护任务(优化版本) + 包括: - - 

     async def auto_link_memories(
         self,
@@ -1478,14 +1576,14 @@ class MemoryManager:

     async def maintenance(self) -> Dict[str, Any]:
         """
-        执行维护任务
-
+        执行维护任务(优化版本)
+
         包括:
-        - 记忆整理(合并相似记忆)
-        - 清理过期记忆
+        - 记忆整理(异步后台执行)
+        - 自动关联记忆(轻量级执行)
         - 自动遗忘低激活度记忆
         - 保存数据
-
+
         Returns:
             维护结果
         """
@@ -1493,52 +1591,355 @@ class MemoryManager:
         if not self._initialized:
             await self.initialize()

         try:
-            logger.info("开始执行记忆系统维护...")
-
+            logger.info("🔧 开始执行记忆系统维护(优化版)...")
+
             result = {
-                "consolidated": 0,
+                "consolidation_task": "none",
+                "linked": 0,
                 "forgotten": 0,
-                "deleted": 0,
                 "saved": False,
+                "total_time": 0,
             }
-
-            # 1. 记忆整理(合并相似记忆)
-            # 默认禁用自动整理,因为可能阻塞主流程
-            # 建议:提高阈值到0.92以上,减少误判;限制批量大小避免阻塞
+
+            start_time = datetime.now()
+
+            # 1. 记忆整理(异步后台执行,不阻塞主流程)
             if getattr(self.config, 'consolidation_enabled', False):
+                logger.info("🚀 启动异步记忆整理任务...")
                 consolidate_result = await self.consolidate_memories(
-                    similarity_threshold=getattr(self.config, 'consolidation_similarity_threshold', 0.92),
-                    time_window_hours=getattr(self.config, 'consolidation_time_window_hours', 24.0),
-                    max_batch_size=getattr(self.config, 'consolidation_max_batch_size', 50)
+                    similarity_threshold=getattr(self.config, 'consolidation_deduplication_threshold', 0.93),
+                    time_window_hours=getattr(self.config, 'consolidation_time_window_hours', 2.0),  # 统一时间窗口
+                    max_batch_size=getattr(self.config, 'consolidation_max_batch_size', 30)
                 )
-                result["consolidated"] = consolidate_result.get("merged_count", 0)
-
-            # 2. 自动关联记忆(发现和建立关系)
-            if getattr(self.config, 'auto_link_enabled', True):
-                link_result = await self.auto_link_memories()
+
+                if consolidate_result.get("task_started"):
+                    result["consolidation_task"] = f"background_task_{consolidate_result.get('task_id', 'unknown')}"
+                    logger.info("✅ 记忆整理任务已启动到后台执行")
+                else:
+                    result["consolidation_task"] = "failed"
+                    logger.warning("❌ 记忆整理任务启动失败")
+
+            # 2. 自动关联记忆(使用统一的时间窗口)
+            if getattr(self.config, 'consolidation_linking_enabled', True):
+                logger.info("🔗 执行轻量级自动关联...")
+                link_result = await self._lightweight_auto_link_memories()
                 result["linked"] = link_result.get("linked_count", 0)
-
-            # 3. 自动遗忘
+
+            # 3. 自动遗忘(快速执行)
             if getattr(self.config, 'forgetting_enabled', True):
+                logger.info("🗑️ 执行自动遗忘...")
                 forgotten_count = await self.auto_forget_memories(
                     threshold=getattr(self.config, 'forgetting_activation_threshold', 0.1)
                 )
                 result["forgotten"] = forgotten_count
-
-            # 4. 清理非常旧的已遗忘记忆(可选)
-            # TODO: 实现清理逻辑
-
-            # 5. 保存数据
-            await self.persistence.save_graph_store(self.graph_store)
-            result["saved"] = True
-
+
+            # 4. 保存数据(如果记忆整理不在后台执行)
+            if result["consolidation_task"] == "none":
+                await self.persistence.save_graph_store(self.graph_store)
+                result["saved"] = True
+                logger.info("💾 数据保存完成")
+
             self._last_maintenance = datetime.now()
-            logger.info(f"维护完成: {result}")
+
+            # 计算维护耗时
+            total_time = (datetime.now() - start_time).total_seconds()
+            result["total_time"] = total_time
+
+            logger.info(f"✅ 维护完成 (耗时 {total_time:.2f}s): {result}")
             return result
-
+
         except Exception as e:
-            logger.error(f"维护失败: {e}", exc_info=True)
-            return {"error": str(e)}
+            logger.error(f"❌ 维护失败: {e}", exc_info=True)
+            return {"error": str(e), "total_time": 0}
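A hypothetical caller-side sketch of the reworked maintenance flow; `manager` is assumed to be an initialized MemoryManager instance, and the keys read here mirror the result dict built above:

    # Hypothetical usage sketch (not part of this patch).
    async def run_maintenance_once(manager) -> None:
        result = await manager.maintenance()
        # Deduplication now runs in a background task instead of blocking here.
        if str(result.get("consolidation_task", "none")).startswith("background_task_"):
            print("deduplication running in background")
        print(f"linked={result.get('linked', 0)}, forgotten={result.get('forgotten', 0)}, "
              f"took {result.get('total_time', 0):.2f}s")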
+
+    async def _lightweight_auto_link_memories(
+        self,
+        time_window_hours: float = None,  # 从配置读取
+        max_candidates: int = None,  # 从配置读取
+        max_memories: int = None,  # 从配置读取
+    ) -> Dict[str, Any]:
+        """
+        智能轻量级自动关联记忆(保留LLM判断,优化性能)
+
+        优化策略:
+        1. 从配置读取处理参数,尊重用户设置
+        2. 使用向量相似度预筛选,仅对高相似度记忆调用LLM
+        3. 批量LLM调用,减少网络开销
+        4. 异步执行,避免阻塞
+        """
+        try:
+            result = {
+                "checked_count": 0,
+                "linked_count": 0,
+                "llm_calls": 0,
+            }
+
+            # 从配置读取参数,使用统一的时间窗口
+            if time_window_hours is None:
+                time_window_hours = getattr(self.config, 'consolidation_time_window_hours', 2.0)
+            if max_candidates is None:
+                max_candidates = getattr(self.config, 'consolidation_linking_max_candidates', 10)
+            if max_memories is None:
+                max_memories = getattr(self.config, 'consolidation_linking_max_memories', 20)
+
+            # 获取用户配置时间窗口内的记忆
+            time_threshold = datetime.now() - timedelta(hours=time_window_hours)
+            all_memories = self.graph_store.get_all_memories()
+
+            recent_memories = [
+                mem for mem in all_memories
+                if mem.created_at >= time_threshold
+                and not mem.metadata.get("forgotten", False)
+                and mem.importance >= getattr(self.config, 'consolidation_linking_min_importance', 0.5)  # 从配置读取重要性阈值
+            ]
+
+            if len(recent_memories) > max_memories:
+                recent_memories = sorted(recent_memories, key=lambda m: m.created_at, reverse=True)[:max_memories]
+
+            if len(recent_memories) < 2:
+                logger.debug("记忆数量不足,跳过智能关联")
+                return result
+
+            logger.debug(f"🧠 智能关联: 检查 {len(recent_memories)} 条重要记忆")
+
+            # 第一步:向量相似度预筛选,找到潜在关联对
+            candidate_pairs = []
+            import numpy as np
+
+            for i, memory in enumerate(recent_memories):
+                # 获取主题节点
+                topic_node = next(
+                    (n for n in memory.nodes if n.node_type == NodeType.TOPIC),
+                    None
+                )
+
+                if not topic_node or topic_node.embedding is None:
+                    continue
+
+                # 与其他记忆计算相似度
+                for j, other_memory in enumerate(recent_memories[i+1:], i+1):
+                    other_topic = next(
+                        (n for n in other_memory.nodes if n.node_type == NodeType.TOPIC),
+                        None
+                    )
+
+                    if not other_topic or other_topic.embedding is None:
+                        continue
+
+                    # 快速相似度计算
+                    similarity = self._fast_cosine_similarity(
+                        topic_node.embedding,
+                        other_topic.embedding
+                    )
+
+                    # 使用配置的预筛选阈值
+                    pre_filter_threshold = getattr(self.config, 'consolidation_linking_pre_filter_threshold', 0.7)
+                    if similarity >= pre_filter_threshold:
+                        candidate_pairs.append((memory, other_memory, similarity))
+
+                # 让出控制权
+                if i % 3 == 0:
+                    await asyncio.sleep(0.001)
+
+            logger.debug(f"🔍 预筛选找到 {len(candidate_pairs)} 个候选关联对")
+
+            if not candidate_pairs:
+                return result
+
+            # 第二步:批量LLM分析(使用配置的最大候选对数)
+            max_pairs_for_llm = getattr(self.config, 'consolidation_linking_max_pairs_for_llm', 5)
+            if len(candidate_pairs) <= max_pairs_for_llm:
+                link_relations = await self._batch_analyze_memory_relations(candidate_pairs)
+                result["llm_calls"] = 1
+
+                # 第三步:建立LLM确认的关联
+                for relation_info in link_relations:
+                    try:
+                        memory_a, memory_b = relation_info["memory_pair"]
+                        relation_type = relation_info["relation_type"]
+                        confidence = relation_info["confidence"]
+
+                        # 创建关联边
+                        edge = MemoryEdge(
+                            id=f"smart_edge_{uuid.uuid4().hex[:12]}",
+                            source_id=memory_a.subject_id,
+                            target_id=memory_b.subject_id,
+                            relation=relation_type,
+                            edge_type=EdgeType.RELATION,
+                            importance=confidence,
+                            metadata={
+                                "auto_linked": True,
+                                "method": "llm_analyzed",
+                                "vector_similarity": relation_info.get("vector_similarity", 0.0),
+                                "confidence": confidence,
+                                "reasoning": relation_info.get("reasoning", ""),
+                                "created_at": datetime.now().isoformat(),
+                            }
+                        )
+
+                        # 添加到图
+                        self.graph_store.graph.add_edge(
+                            edge.source_id,
+                            edge.target_id,
+                            edge_id=edge.id,
+                            relation=edge.relation,
+                            edge_type=edge.edge_type.value,
+                            importance=edge.importance,
+                            metadata=edge.metadata,
+                        )
+
+                        memory_a.edges.append(edge)
+                        result["linked_count"] += 1
+
+                        logger.debug(f"🧠 智能关联: {memory_a.id[:8]} --[{relation_type}]--> {memory_b.id[:8]} (置信度={confidence:.2f})")
+
+                    except Exception as e:
+                        logger.warning(f"建立智能关联失败: {e}")
+                        continue
+
+            # 保存关联结果
+            if result["linked_count"] > 0:
+                await self.persistence.save_graph_store(self.graph_store)
+
+            logger.debug(f"✅ 智能关联完成: 建立了 {result['linked_count']} 个关联,LLM调用 {result['llm_calls']} 次")
+            return result
+
+        except Exception as e:
+            logger.error(f"智能关联失败: {e}", exc_info=True)
+            return {"error": str(e), "checked_count": 0, "linked_count": 0}
+
+    async def _batch_analyze_memory_relations(
+        self,
+        candidate_pairs: List[Tuple[Memory, Memory, float]]
+    ) -> List[Dict[str, Any]]:
+        """
+        批量分析记忆关系(优化LLM调用)
+
+        Args:
+            candidate_pairs: 候选记忆对列表,每项包含 (memory_a, memory_b, vector_similarity)
+
+        Returns:
+            关系分析结果列表
+        """
+        try:
+            from src.llm_models.utils_model import LLMRequest
+            from src.config.config import model_config
+
+            llm = LLMRequest(
+                model_set=model_config.model_task_config.utils_small,
+                request_type="memory.batch_relation_analysis"
+            )
+
+            # 格式化所有候选记忆对
+            candidates_text = ""
+            for i, (mem_a, mem_b, similarity) in enumerate(candidate_pairs):
+                desc_a = self._format_memory_for_llm(mem_a)
+                desc_b = self._format_memory_for_llm(mem_b)
+                candidates_text += f"""
+候选对 {i+1}:
+记忆A: {desc_a}
+记忆B: {desc_b}
+向量相似度: {similarity:.3f}
+"""
+
+            # 构建批量分析提示词(使用配置的置信度阈值)
+            min_confidence = getattr(self.config, 'consolidation_linking_min_confidence', 0.7)
+
+            prompt = f"""你是记忆关系分析专家。请批量分析以下候选记忆对之间的关系。
+
+**关系类型说明:**
+- 导致: A的发生导致了B的发生(因果关系)
+- 引用: A提到或涉及B(引用关系)
+- 相似: A和B描述相似的内容(相似关系)
+- 相反: A和B表达相反的观点(对立关系)
+- 关联: A和B存在某种关联但不属于以上类型(一般关联)
+
+**候选记忆对:**
+{candidates_text}
+
+**任务要求:**
+1. 对每个候选对,判断是否存在有意义的关系
+2. 如果存在关系,指定关系类型和置信度(0.0-1.0)
+3. 简要说明判断理由
+4. 只返回置信度 >= {min_confidence} 的关系
+5. 优先考虑因果、引用等强关系,谨慎建立相似关系
+
+**输出格式(JSON):**
+```json
+[
+    {{
+        "candidate_id": 1,
+        "has_relation": true,
+        "relation_type": "导致",
+        "confidence": 0.85,
+        "reasoning": "记忆A描述的原因导致记忆B的结果"
+    }},
+    {{
+        "candidate_id": 2,
+        "has_relation": false,
+        "reasoning": "两者无明显关联"
+    }}
+]
+```
+
+请分析并输出JSON结果:"""
+
+            # 调用LLM(使用配置的参数)
+            llm_temperature = getattr(self.config, 'consolidation_linking_llm_temperature', 0.2)
+            llm_max_tokens = getattr(self.config, 'consolidation_linking_llm_max_tokens', 1500)
+
+            response, _ = await llm.generate_response_async(
+                prompt,
+                temperature=llm_temperature,
+                max_tokens=llm_max_tokens,
+            )
+
+            # 解析响应
+            import json
+            import re
+
+            # 提取JSON
+            json_match = re.search(r'```json\s*(.*?)\s*```', response, re.DOTALL)
+            if json_match:
+                json_str = json_match.group(1)
+            else:
+                json_str = response.strip()
+
+            try:
+                analysis_results = json.loads(json_str)
+            except json.JSONDecodeError:
+                logger.warning(f"LLM返回格式错误,尝试修复: {response[:200]}")
+                # 尝试简单修复
+                json_str = re.sub(r'[\r\n\t]', '', json_str)
+                analysis_results = json.loads(json_str)
+
+            # 转换为结果格式
+            relations = []
+            for result in analysis_results:
+                if not result.get("has_relation", False):
+                    continue
+
+                confidence = result.get("confidence", 0.0)
+                if confidence < min_confidence:  # 使用配置的置信度阈值
+                    continue
+
+                candidate_id = result.get("candidate_id", 0) - 1
+                if 0 <= candidate_id < len(candidate_pairs):
+                    mem_a, mem_b, vector_similarity = candidate_pairs[candidate_id]
+                    relations.append({
+                        "memory_pair": (mem_a, mem_b),
+                        "relation_type": result.get("relation_type", "关联"),
+                        "confidence": confidence,
+                        "reasoning": result.get("reasoning", ""),
+                        "vector_similarity": vector_similarity,
+                    })
+
+            logger.debug(f"🧠 LLM批量分析完成: 发现 {len(relations)} 个关系")
+            return relations
+
+        except Exception as e:
+            logger.error(f"LLM批量关系分析失败: {e}", exc_info=True)
+            return []
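The JSON repair above only strips control characters before retrying `json.loads`, which fails on any other formatting noise from the model. A more defensive sketch that scans the response for the first decodable JSON array; `extract_json_array` is a hypothetical helper, not part of this patch:

    # Hypothetical helper: pull the first JSON array embedded anywhere in an
    # LLM response, tolerating prose before and after it.
    import json
    from typing import Any, List

    def extract_json_array(text: str) -> List[Any]:
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                value, _ = decoder.raw_decode(text, start)
                if isinstance(value, list):
                    return value
            except json.JSONDecodeError:
                pass
            start = text.find("[", start + 1)
        return []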

     async def start_maintenance_scheduler(self) -> None:
         """
diff --git a/src/memory_graph/tools/memory_tools.py b/src/memory_graph/tools/memory_tools.py
index 798a36268..9f0e431c9 100644
--- a/src/memory_graph/tools/memory_tools.py
+++ b/src/memory_graph/tools/memory_tools.py
@@ -35,24 +35,27 @@ class MemoryTools:
         persistence_manager: PersistenceManager,
         embedding_generator: Optional[EmbeddingGenerator] = None,
         max_expand_depth: int = 1,
+        expand_semantic_threshold: float = 0.3,
     ):
         """
         初始化工具集
-
+
         Args:
             vector_store: 向量存储
             graph_store: 图存储
             persistence_manager: 持久化管理器
             embedding_generator: 嵌入生成器(可选)
             max_expand_depth: 图扩展深度的默认值(从配置读取)
+            expand_semantic_threshold: 图扩展时语义相似度阈值(从配置读取)
         """
         self.vector_store = vector_store
         self.graph_store = graph_store
         self.persistence_manager = persistence_manager
         self._initialized = False
         self.max_expand_depth = max_expand_depth  # 保存配置的默认值
-
-        logger.info(f"MemoryTools 初始化: max_expand_depth={max_expand_depth}")
+        self.expand_semantic_threshold = expand_semantic_threshold  # 保存配置的语义阈值
+
+        logger.info(f"MemoryTools 初始化: max_expand_depth={max_expand_depth}, expand_semantic_threshold={expand_semantic_threshold}")

         # 初始化组件
         self.extractor = MemoryExtractor()
@@ -507,7 +510,7 @@ class MemoryTools:
                 initial_memory_ids=list(initial_memory_ids),
                 query_embedding=query_embedding,
                 max_depth=expand_depth,
-                semantic_threshold=0.5,
+                semantic_threshold=self.expand_semantic_threshold,  # 使用配置的阈值
                 max_expanded=top_k * 2
             )

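For readers unfamiliar with the new `expand_semantic_threshold` parameter, this is roughly how a semantic threshold gates graph expansion during retrieval; `neighbors_of` and `embedding_of` are hypothetical stand-ins, not this repository's API:

    # Illustrative only: keep an expansion hop only if it stays semantically
    # close to the query; a lower threshold widens recall, a higher one stays on-topic.
    import numpy as np

    def cosine(a: np.ndarray, b: np.ndarray) -> float:
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return float(np.dot(a, b) / denom) if denom else 0.0

    def expand_once(frontier, query_embedding, threshold, neighbors_of, embedding_of):
        expanded = []
        for node_id in frontier:
            for neighbor in neighbors_of(node_id):
                emb = embedding_of(neighbor)
                if emb is not None and cosine(query_embedding, emb) >= threshold:
                    expanded.append(neighbor)
        return expanded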
diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml
index 81f34c347..60fbfbb83 100644
--- a/template/bot_config_template.toml
+++ b/template/bot_config_template.toml
@@ -1,5 +1,5 @@
 [inner]
-version = "7.6.2"
+version = "7.6.4"

 #----以下是给开发人员阅读的,如果你只是部署了MoFox-Bot,不需要阅读----
 #如果你想要修改配置文件,请递增version的值
@@ -251,19 +251,32 @@ vector_db_path = "data/memory_graph/chroma_db" # 向量数据库路径 (使用
 # === 记忆检索配置 ===
 search_top_k = 10 # 默认检索返回数量
 search_min_importance = 0.3 # 最小重要性阈值 (0.0-1.0)
-search_similarity_threshold = 0.5 # 向量相似度阈值
+search_similarity_threshold = 0.6 # 向量相似度阈值
+search_expand_semantic_threshold = 0.3 # 图扩展时语义相似度阈值(建议0.3-0.5,过低可能引入无关记忆,过高无法扩展)

 # 智能查询优化
 enable_query_optimization = true # 启用查询优化(使用小模型分析对话历史,生成综合性搜索查询)

 # === 记忆整合配置 ===
+# 记忆整合包含两个功能:1)去重(合并相似记忆)2)关联(建立记忆关系)
 # 注意:整合任务会遍历所有记忆进行相似度计算,可能占用较多资源
 # 建议:1) 降低执行频率;2) 提高相似度阈值减少误判;3) 限制批量大小
 consolidation_enabled = true # 是否启用记忆整合
 consolidation_interval_hours = 1.0 # 整合任务执行间隔
-consolidation_similarity_threshold = 0.92 # 相似记忆去重阈值(建议>=0.92减少误判,0.85太低)
-consolidation_time_window_hours = 6.0 # 整合时间窗口(小时)
-consolidation_max_batch_size = 50 # 单次最多处理的记忆数量(限制批量避免阻塞)
+consolidation_deduplication_threshold = 0.93 # 相似记忆去重阈值
+consolidation_time_window_hours = 2.0 # 整合时间窗口(小时)- 统一用于去重和关联
+consolidation_max_batch_size = 100 # 单次最多处理的记忆数量
+
+# 记忆关联配置(整合功能的子模块)
+consolidation_linking_enabled = true # 是否启用记忆关联建立
+consolidation_linking_max_candidates = 10 # 每个记忆最多关联的候选数
+consolidation_linking_max_memories = 20 # 单次最多处理的记忆总数
+consolidation_linking_min_importance = 0.5 # 最低重要性阈值(低于此值的记忆不参与关联)
+consolidation_linking_pre_filter_threshold = 0.7 # 向量相似度预筛选阈值
+consolidation_linking_max_pairs_for_llm = 5 # 最多发送给LLM分析的候选对数
+consolidation_linking_min_confidence = 0.7 # LLM分析最低置信度阈值
+consolidation_linking_llm_temperature = 0.2 # LLM分析温度参数
+consolidation_linking_llm_max_tokens = 1500 # LLM分析最大输出长度

 # === 记忆遗忘配置 ===
 forgetting_enabled = true # 是否启用自动遗忘
@@ -273,7 +286,7 @@ forgetting_min_importance = 0.8 # 最小保护重要性(高于此值的记忆
 # === 记忆激活配置 ===
 activation_decay_rate = 0.9 # 激活度衰减率(每天衰减10%)
 activation_propagation_strength = 0.5 # 激活传播强度(传播到相关记忆的激活度比例)
-activation_propagation_depth = 2 # 激活传播深度(最多传播几层,建议1-2)
+activation_propagation_depth = 1 # 激活传播深度(最多传播几层,建议1-2)

 # === 记忆检索配置 ===
 search_max_expand_depth = 2 # 检索时图扩展深度(0=仅直接匹配,1=扩展1跳,2=扩展2跳,推荐1-2)
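A minimal sketch of reading the new consolidation keys with tomllib (Python 3.11+); the `[memory_graph]` table name and the file path are illustrative assumptions, not taken from this template, and the defaults mirror the getattr fallbacks used in manager.py:

    # Hypothetical config-loading sketch (table name and path are assumptions).
    import tomllib

    with open("bot_config.toml", "rb") as f:
        cfg = tomllib.load(f)

    memory_cfg = cfg.get("memory_graph", {})
    dedup_threshold = memory_cfg.get("consolidation_deduplication_threshold", 0.93)
    time_window_h = memory_cfg.get("consolidation_time_window_hours", 2.0)
    max_batch = memory_cfg.get("consolidation_max_batch_size", 100)
    linking_enabled = memory_cfg.get("consolidation_linking_enabled", True)
    pre_filter = memory_cfg.get("consolidation_linking_pre_filter_threshold", 0.7)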