better: hippocampus 2.0 upgrade - progress 30%

SengokuCola
2025-03-27 22:14:23 +08:00
parent 2812b0df3c
commit b474da3875
30 changed files with 433 additions and 2410 deletions


@@ -6,19 +6,77 @@ import time
 import re
 import jieba
 import networkx as nx
 # from nonebot import get_driver
 import numpy as np
 from collections import Counter
 from ...common.database import db
 # from ..chat.config import global_config
-from ..chat.utils import (
-    calculate_information_content,
-    cosine_similarity,
-    get_closest_chat_from_db,
-)
-from ..models.utils_model import LLM_request
+from ...plugins.models.utils_model import LLM_request
 from src.common.logger import get_module_logger, LogConfig, MEMORY_STYLE_CONFIG
 from src.plugins.memory_system.sample_distribution import MemoryBuildScheduler  # distribution generator
-from .config import MemoryConfig
+from .memory_config import MemoryConfig
+
+def get_closest_chat_from_db(length: int, timestamp: str):
+    # print(f"获取最接近指定时间戳的聊天记录,长度: {length}, 时间戳: {timestamp}")
+    # print(f"当前时间: {timestamp},转换后时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))}")
+    chat_records = []
+    closest_record = db.messages.find_one({"time": {"$lte": timestamp}}, sort=[("time", -1)])
+    # print(f"最接近的记录: {closest_record}")
+    if closest_record:
+        closest_time = closest_record["time"]
+        chat_id = closest_record["chat_id"]  # get the chat_id
+        # fetch the `length` messages after this timestamp, keeping the same chat_id
+        chat_records = list(
+            db.messages.find(
+                {
+                    "time": {"$gt": closest_time},
+                    "chat_id": chat_id,  # filter by chat_id
+                }
+            )
+            .sort("time", 1)
+            .limit(length)
+        )
+        # print(f"获取到的记录: {chat_records}")
+        length = len(chat_records)
+        # print(f"获取到的记录长度: {length}")
+        # convert the record format
+        formatted_records = []
+        for record in chat_records:
+            # compatibility shim: stay forward-compatible with old data
+            formatted_records.append(
+                {
+                    "_id": record["_id"],
+                    "time": record["time"],
+                    "chat_id": record["chat_id"],
+                    "detailed_plain_text": record.get("detailed_plain_text", ""),  # text content
+                    "memorized_times": record.get("memorized_times", 0),  # times this record was memorized
+                }
+            )
+        return formatted_records
+    return []
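# Usage sketch (illustrative; assumes a populated db.messages collection — note
# that callers appear to pass an epoch float despite the str annotation):
#     records = get_closest_chat_from_db(length=20, timestamp=time.time())
#     texts = [r["detailed_plain_text"] for r in records]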
+
+def calculate_information_content(text):
+    """Calculate the information content (Shannon entropy) of a text."""
+    char_count = Counter(text)
+    total_chars = len(text)
+    entropy = 0
+    for count in char_count.values():
+        probability = count / total_chars
+        entropy -= probability * math.log2(probability)
+    return entropy
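# Worked example: for "aabb", both characters occur with probability 0.5, so
# entropy = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) = 1.0 bit; "aaaa" gives 0.0,
# and higher values mean more varied, information-dense text.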
+
+def cosine_similarity(v1, v2):
+    """Calculate the cosine similarity of two vectors."""
+    dot_product = np.dot(v1, v2)
+    norm1 = np.linalg.norm(v1)
+    norm2 = np.linalg.norm(v2)
+    if norm1 == 0 or norm2 == 0:
+        return 0
+    return dot_product / (norm1 * norm2)
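# Worked example: cosine_similarity([1, 0], [1, 1]) = 1 / sqrt(2) ≈ 0.707;
# orthogonal vectors give 0.0, and the zero-vector guard above also returns 0.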
 
 # define the logging configuration
 memory_config = LogConfig(
@@ -393,6 +451,59 @@ class EntorhinalCortex:
         if need_update:
             logger.success("[数据库] 已为缺失的时间字段进行补充")
+
+    async def resync_memory_to_db(self):
+        """Wipe the database and re-sync all memory data from the in-memory graph."""
+        start_time = time.time()
+        logger.info("[数据库] 开始重新同步所有记忆数据...")
+
+        # wipe the database
+        clear_start = time.time()
+        db.graph_data.nodes.delete_many({})
+        db.graph_data.edges.delete_many({})
+        clear_end = time.time()
+        logger.info(f"[数据库] 清空数据库耗时: {clear_end - clear_start:.2f}")
+
+        # collect all nodes and edges
+        memory_nodes = list(self.memory_graph.G.nodes(data=True))
+        memory_edges = list(self.memory_graph.G.edges(data=True))
+
+        # rewrite the nodes
+        node_start = time.time()
+        for concept, data in memory_nodes:
+            memory_items = data.get("memory_items", [])
+            if not isinstance(memory_items, list):
+                memory_items = [memory_items] if memory_items else []
+            node_data = {
+                "concept": concept,
+                "memory_items": memory_items,
+                "hash": self.hippocampus.calculate_node_hash(concept, memory_items),
+                "created_time": data.get("created_time", datetime.datetime.now().timestamp()),
+                "last_modified": data.get("last_modified", datetime.datetime.now().timestamp()),
+            }
+            db.graph_data.nodes.insert_one(node_data)
+        node_end = time.time()
+        logger.info(f"[数据库] 写入 {len(memory_nodes)} 个节点耗时: {node_end - node_start:.2f}")
+
+        # rewrite the edges
+        edge_start = time.time()
+        for source, target, data in memory_edges:
+            edge_data = {
+                "source": source,
+                "target": target,
+                "strength": data.get("strength", 1),
+                "hash": self.hippocampus.calculate_edge_hash(source, target),
+                "created_time": data.get("created_time", datetime.datetime.now().timestamp()),
+                "last_modified": data.get("last_modified", datetime.datetime.now().timestamp()),
+            }
+            db.graph_data.edges.insert_one(edge_data)
+        edge_end = time.time()
+        logger.info(f"[数据库] 写入 {len(memory_edges)} 条边耗时: {edge_end - edge_start:.2f}")
+
+        end_time = time.time()
+        logger.success(f"[数据库] 重新同步完成,总耗时: {end_time - start_time:.2f}")
+        logger.success(f"[数据库] 同步了 {len(memory_nodes)} 个节点和 {len(memory_edges)} 条边")
 
 # responsible for consolidating, forgetting, and merging memories
 class ParahippocampalGyrus:
     def __init__(self, hippocampus):
@@ -582,7 +693,8 @@ class ParahippocampalGyrus:
"秒---------------------"
)
async def operation_forget_topic(self, percentage=0.1):
async def operation_forget_topic(self, percentage=0.005):
start_time = time.time()
logger.info("[遗忘] 开始检查数据库...")
all_nodes = list(self.memory_graph.G.nodes())
@@ -598,12 +710,20 @@ class ParahippocampalGyrus:
         nodes_to_check = random.sample(all_nodes, check_nodes_count)
         edges_to_check = random.sample(all_edges, check_edges_count)
 
-        edge_changes = {"weakened": 0, "removed": 0}
-        node_changes = {"reduced": 0, "removed": 0}
+        # track changes in lists instead of bare counters
+        edge_changes = {
+            "weakened": [],  # edges whose strength was reduced
+            "removed": [],   # edges that were removed
+        }
+        node_changes = {
+            "reduced": [],  # nodes that lost memory items
+            "removed": [],  # nodes that were removed
+        }
 
         current_time = datetime.datetime.now().timestamp()
 
         logger.info("[遗忘] 开始检查连接...")
+        edge_check_start = time.time()
         for source, target in edges_to_check:
             edge_data = self.memory_graph.G[source][target]
             last_modified = edge_data.get("last_modified")
@@ -614,15 +734,16 @@ class ParahippocampalGyrus:
                 if new_strength <= 0:
                     self.memory_graph.G.remove_edge(source, target)
-                    edge_changes["removed"] += 1
-                    logger.info(f"[遗忘] 连接移除: {source} -> {target}")
+                    edge_changes["removed"].append(f"{source} -> {target}")
                 else:
                     edge_data["strength"] = new_strength
                     edge_data["last_modified"] = current_time
-                    edge_changes["weakened"] += 1
-                    logger.info(f"[遗忘] 连接减弱: {source} -> {target} (强度: {current_strength} -> {new_strength})")
+                    edge_changes["weakened"].append(f"{source}-{target} (强度: {current_strength} -> {new_strength})")
+        edge_check_end = time.time()
+        logger.info(f"[遗忘] 连接检查耗时: {edge_check_end - edge_check_start:.2f}")
 
         logger.info("[遗忘] 开始检查节点...")
+        node_check_start = time.time()
         for node in nodes_to_check:
             node_data = self.memory_graph.G.nodes[node]
             last_modified = node_data.get("last_modified", current_time)
@@ -640,21 +761,40 @@ class ParahippocampalGyrus:
                 if memory_items:
                     self.memory_graph.G.nodes[node]["memory_items"] = memory_items
                     self.memory_graph.G.nodes[node]["last_modified"] = current_time
-                    node_changes["reduced"] += 1
-                    logger.info(f"[遗忘] 记忆减少: {node} (数量: {current_count} -> {len(memory_items)})")
+                    node_changes["reduced"].append(f"{node} (数量: {current_count} -> {len(memory_items)})")
                 else:
                     self.memory_graph.G.remove_node(node)
-                    node_changes["removed"] += 1
-                    logger.info(f"[遗忘] 节点移除: {node}")
+                    node_changes["removed"].append(node)
+        node_check_end = time.time()
+        logger.info(f"[遗忘] 节点检查耗时: {node_check_end - node_check_start:.2f}")
 
-        if any(count > 0 for count in edge_changes.values()) or any(count > 0 for count in node_changes.values()):
-            await self.hippocampus.entorhinal_cortex.sync_memory_to_db()
-            logger.info("[遗忘] 统计信息:")
-            logger.info(f"[遗忘] 连接变化: {edge_changes['weakened']} 个减弱, {edge_changes['removed']} 个移除")
-            logger.info(f"[遗忘] 节点变化: {node_changes['reduced']} 个减少记忆, {node_changes['removed']} 个移除")
+        if any(edge_changes.values()) or any(node_changes.values()):
+            sync_start = time.time()
+            await self.hippocampus.entorhinal_cortex.resync_memory_to_db()
+            sync_end = time.time()
+            logger.info(f"[遗忘] 数据库同步耗时: {sync_end - sync_start:.2f}")
+
+            # aggregate and report every change
+            logger.info("[遗忘] 遗忘操作统计:")
+            if edge_changes["weakened"]:
+                logger.info(f"[遗忘] 减弱的连接 ({len(edge_changes['weakened'])}个): {', '.join(edge_changes['weakened'])}")
+            if edge_changes["removed"]:
+                logger.info(f"[遗忘] 移除的连接 ({len(edge_changes['removed'])}个): {', '.join(edge_changes['removed'])}")
+            if node_changes["reduced"]:
+                logger.info(f"[遗忘] 减少记忆的节点 ({len(node_changes['reduced'])}个): {', '.join(node_changes['reduced'])}")
+            if node_changes["removed"]:
+                logger.info(f"[遗忘] 移除的节点 ({len(node_changes['removed'])}个): {', '.join(node_changes['removed'])}")
+        else:
+            logger.info("[遗忘] 本次检查没有节点或连接满足遗忘条件")
 
         end_time = time.time()
         logger.info(f"[遗忘] 总耗时: {end_time - start_time:.2f}")
 
 # the hippocampus
 class Hippocampus:
     def __init__(self):
@@ -696,7 +836,7 @@ class Hippocampus:
         prompt = (
             f"这是一段文字:{text}。请你从这段话中总结出最多{topic_num}个关键的概念,可以是名词,动词,或者特定人物,帮我列出来,"
             f"将主题用逗号隔开,并加上<>,例如<主题1>,<主题2>......尽可能精简。只需要列举最多{topic_num}个话题就好,不要有序号,不要告诉我其他内容。"
-            f"如果找不出主题或者没有明显主题,返回<none>。"
+            f"如果确定找不出主题或者没有明显主题,返回<none>。"
         )
         return prompt
@@ -763,7 +903,7 @@ class Hippocampus:
         memories.sort(key=lambda x: x[2], reverse=True)
         return memories
 
-    async def get_memory_from_text(self, text: str, num: int = 5, max_depth: int = 2,
+    async def get_memory_from_text(self, text: str, num: int = 5, max_depth: int = 3,
                                    fast_retrieval: bool = False) -> list:
         """Extract keywords from the text and retrieve the related memories.
@@ -795,7 +935,8 @@ class Hippocampus:
             keywords = keywords[:5]
         else:
             # extract keywords with the LLM
-            topic_num = min(5, max(1, int(len(text) * 0.1)))  # scale keyword count with text length
+            topic_num = min(5, max(1, int(len(text) * 0.2)))  # scale keyword count with text length
+            print(f"提取关键词数量: {topic_num}")
             topics_response = await self.llm_topic_judge.generate_response(
                 self.find_topic_llm(text, topic_num)
             )
@@ -811,11 +952,84 @@ class Hippocampus:
                 if keyword.strip()
             ]
         logger.info(f"提取的关键词: {', '.join(keywords)}")
 
         # gather memories from each keyword
         all_memories = []
+        keyword_connections = []  # connections discovered between keywords
+
+        # check for connections between the keywords
+        for i in range(len(keywords)):
+            for j in range(i + 1, len(keywords)):
+                keyword1, keyword2 = keywords[i], keywords[j]
+
+                # both nodes must exist in the graph
+                if keyword1 not in self.memory_graph.G or keyword2 not in self.memory_graph.G:
+                    logger.debug(f"关键词 {keyword1} 或 {keyword2} 不在记忆图中")
+                    continue
+
+                # check for a direct connection
+                if self.memory_graph.G.has_edge(keyword1, keyword2):
+                    keyword_connections.append((keyword1, keyword2, 1))
+                    logger.info(f"发现直接连接: {keyword1} <-> {keyword2} (长度: 1)")
+                    continue
+
+                # check for an indirect connection (through other nodes)
+                for depth in range(2, max_depth + 1):
+                    # use networkx's shortest_path_length to test for a path within the given length
+                    try:
+                        path_length = nx.shortest_path_length(self.memory_graph.G, keyword1, keyword2)
+                        if path_length <= depth:
+                            keyword_connections.append((keyword1, keyword2, path_length))
+                            logger.info(f"发现间接连接: {keyword1} <-> {keyword2} (长度: {path_length})")
+                            # log the connecting path
+                            path = nx.shortest_path(self.memory_graph.G, keyword1, keyword2)
+                            logger.info(f"连接路径: {' -> '.join(path)}")
+                            break
+                    except nx.NetworkXNoPath:
+                        continue
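                # Review note: nx.shortest_path_length ignores `depth`, so this
                # inner loop recomputes the same length every iteration; it is
                # equivalent to a single `path_length <= max_depth` check after
                # one shortest-path call.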
+
+        if not keyword_connections:
+            logger.info("未发现任何关键词之间的连接")
+
+        # remember which keyword connections have been processed
+        processed_connections = set()
+
+        # gather memories from each keyword
         for keyword in keywords:
-            memories = self.get_memory_from_keyword(keyword, max_depth)
-            all_memories.extend(memories)
+            if keyword in self.memory_graph.G:  # only handle keywords that exist in the graph
+                memories = self.get_memory_from_keyword(keyword, max_depth)
+                all_memories.extend(memories)
+
+        # gather memories tied to the keyword connections
+        for keyword1, keyword2, path_length in keyword_connections:
+            if (keyword1, keyword2) in processed_connections or (keyword2, keyword1) in processed_connections:
+                continue
+            processed_connections.add((keyword1, keyword2))
+
+            # walk every node on the connecting path
+            try:
+                path = nx.shortest_path(self.memory_graph.G, keyword1, keyword2)
+                for node in path:
+                    if node not in keywords:  # only handle non-keyword nodes on the path
+                        node_data = self.memory_graph.G.nodes[node]
+                        memory_items = node_data.get("memory_items", [])
+                        if not isinstance(memory_items, list):
+                            memory_items = [memory_items] if memory_items else []
+
+                        # score similarity against the input text
+                        node_words = set(jieba.cut(node))
+                        text_words = set(jieba.cut(text))
+                        all_words = node_words | text_words
+                        v1 = [1 if word in node_words else 0 for word in all_words]
+                        v2 = [1 if word in text_words else 0 for word in all_words]
+                        similarity = cosine_similarity(v1, v2)
+
+                        if similarity >= 0.3:  # similarity threshold
+                            all_memories.append((node, memory_items, similarity))
+            except nx.NetworkXNoPath:
+                continue
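        # Worked example for the 0.3 gate (illustrative): if jieba splits a path
        # node into {"深度", "学习"} and the input text into {"我", "在", "学", "深度", "学习"},
        # the indicator vectors overlap on 2 of the 5 union words, giving
        # 2 / (sqrt(2) * sqrt(5)) ≈ 0.63 — above the threshold, so the node's
        # memories are appended with that score.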
 
         # deduplicate (by topic)
         seen_topics = set()
@@ -871,6 +1085,16 @@ class HippocampusManager:
logger.success(f"记忆构建分布: {config.memory_build_distribution}")
logger.success("--------------------------------")
# 输出记忆图统计信息
memory_graph = self._hippocampus.memory_graph.G
node_count = len(memory_graph.nodes())
edge_count = len(memory_graph.edges())
logger.success("--------------------------------")
logger.success("记忆图统计信息:")
logger.success(f"记忆节点数量: {node_count}")
logger.success(f"记忆连接数量: {edge_count}")
logger.success("--------------------------------")
return self._hippocampus
     async def build_memory(self):
@@ -879,7 +1103,7 @@ class HippocampusManager:
             raise RuntimeError("HippocampusManager 尚未初始化,请先调用 initialize 方法")
         return await self._hippocampus.parahippocampal_gyrus.operation_build_memory()
 
-    async def forget_memory(self, percentage: float = 0.1):
+    async def forget_memory(self, percentage: float = 0.005):
         """Public interface for forgetting memories."""
         if not self._initialized:
             raise RuntimeError("HippocampusManager 尚未初始化,请先调用 initialize 方法")
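# Usage sketch (illustrative; the initialize signature is not shown in this diff):
#     manager = HippocampusManager()
#     await manager.forget_memory()  # raises RuntimeError: call initialize first
#     await manager.initialize(...)
#     await manager.forget_memory()  # decays a small random sample of the graph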