Merge branch 'main-fix' into main

覅是
2025-03-21 19:23:25 +09:00
committed by GitHub
48 changed files with 1358 additions and 1841 deletions


@@ -18,6 +18,7 @@ from ..chat.utils import (
)
from ..models.utils_model import LLM_request
from src.common.logger import get_module_logger, LogConfig, MEMORY_STYLE_CONFIG
from src.plugins.memory_system.sample_distribution import MemoryBuildScheduler
# Define the logging configuration
memory_config = LogConfig(
@@ -25,6 +26,11 @@ memory_config = LogConfig(
console_format=MEMORY_STYLE_CONFIG["console_format"],
file_format=MEMORY_STYLE_CONFIG["file_format"],
)
# print(f"memory_config: {memory_config}")
# print(f"MEMORY_STYLE_CONFIG: {MEMORY_STYLE_CONFIG}")
# print(f"MEMORY_STYLE_CONFIG['console_format']: {MEMORY_STYLE_CONFIG['console_format']}")
# print(f"MEMORY_STYLE_CONFIG['file_format']: {MEMORY_STYLE_CONFIG['file_format']}")
logger = get_module_logger("memory_system", config=memory_config)
@@ -168,9 +174,9 @@ class Memory_graph:
class Hippocampus:
def __init__(self, memory_graph: Memory_graph):
self.memory_graph = memory_graph
self.llm_topic_judge = LLM_request(model=global_config.llm_topic_judge, temperature=0.5, request_type="topic")
self.llm_topic_judge = LLM_request(model=global_config.llm_topic_judge, temperature=0.5, request_type="memory")
self.llm_summary_by_topic = LLM_request(
model=global_config.llm_summary_by_topic, temperature=0.5, request_type="topic"
model=global_config.llm_summary_by_topic, temperature=0.5, request_type="memory"
)
def get_all_node_names(self) -> list:
@@ -195,25 +201,17 @@ class Hippocampus:
return hash(f"{nodes[0]}:{nodes[1]}")
def random_get_msg_snippet(self, target_timestamp: float, chat_size: int, max_memorized_time_per_msg: int) -> list:
"""随机抽取一段时间内的消息片段
Args:
- target_timestamp: 目标时间戳
- chat_size: 抽取的消息数量
- max_memorized_time_per_msg: 每条消息的最大记忆次数
Returns:
- list: 抽取出的消息记录列表
"""
try_count = 0
# Try the sampling a limited number of times
# Try the sampling at most 3 times
while try_count < 3:
messages = get_closest_chat_from_db(length=chat_size, timestamp=target_timestamp)
if messages:
# print(f"抽取到的消息: {messages}")
# 检查messages是否均没有达到记忆次数限制
for message in messages:
if message["memorized_times"] >= max_memorized_time_per_msg:
messages = None
# print(f"抽取到的消息提取次数达到限制,跳过")
break
if messages:
# Successfully sampled a short-term message snippet
@@ -224,63 +222,48 @@ class Hippocampus:
)
return messages
try_count += 1
# All three attempts failed
return None
def get_memory_sample(self, chat_size=20, time_frequency=None):
"""获取记忆样本
Returns:
list: 消息记录列表,每个元素是一个消息记录字典列表
"""
def get_memory_sample(self):
# Hard-coded: maximum memorization count per message
# Could be moved into global_config if needed
if time_frequency is None:
time_frequency = {"near": 2, "mid": 4, "far": 3}
max_memorized_time_per_msg = 3
current_timestamp = datetime.datetime.now().timestamp()
# Create a memory scheduler with a bimodal distribution
scheduler = MemoryBuildScheduler(
n_hours1=global_config.memory_build_distribution[0], # mean of distribution 1 (4 hours ago)
std_hours1=global_config.memory_build_distribution[1], # std of distribution 1
weight1=global_config.memory_build_distribution[2], # weight of distribution 1 (60%)
n_hours2=global_config.memory_build_distribution[3], # mean of distribution 2 (24 hours ago)
std_hours2=global_config.memory_build_distribution[4], # std of distribution 2
weight2=global_config.memory_build_distribution[5], # weight of distribution 2 (40%)
total_samples=global_config.build_memory_sample_num # generate 10 time points in total
)
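# Assumption, not stated in this diff: given the indices and comments above,
# memory_build_distribution is presumably a six-element sequence of the form
# [mean1_hours, std1_hours, weight1, mean2_hours, std2_hours, weight2],
# e.g. [4, 2, 0.6, 24, 8, 0.4] (hypothetical values).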
# Generate the timestamp array
timestamps = scheduler.get_timestamp_array()
# logger.debug(f"生成的时间戳数组: {timestamps}")
# print(f"生成的时间戳数组: {timestamps}")
# print(f"时间戳的实际时间: {[time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts)) for ts in timestamps]}")
logger.info(f"回忆往事: {[time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts)) for ts in timestamps]}")
chat_samples = []
# Short-term 1 h, mid-term 4 h, long-term 24 h
logger.debug("Sampling short-term messages")
for i in range(time_frequency.get("near")):
random_time = current_timestamp - random.randint(1, 3600)
messages = self.random_get_msg_snippet(random_time, chat_size, max_memorized_time_per_msg)
for timestamp in timestamps:
messages = self.random_get_msg_snippet(
timestamp,
global_config.build_memory_sample_length,
max_memorized_time_per_msg
)
if messages:
logger.debug(f"成功抽取短期消息样本{len(messages)}")
time_diff = (datetime.datetime.now().timestamp() - timestamp) / 3600
logger.debug(f"成功抽取 {time_diff:.1f} 小时前的消息样本,共{len(messages)}")
# print(f"成功抽取 {time_diff:.1f} 小时前的消息样本,共{len(messages)}条")
chat_samples.append(messages)
else:
logger.warning(f"{i}次短期消息样本抽取失败")
logger.debug("正在抽取中期消息样本")
for i in range(time_frequency.get("mid")):
random_time = current_timestamp - random.randint(3600, 3600 * 4)
messages = self.random_get_msg_snippet(random_time, chat_size, max_memorized_time_per_msg)
if messages:
logger.debug(f"成功抽取中期消息样本{len(messages)}")
chat_samples.append(messages)
else:
logger.warning(f"{i}次中期消息样本抽取失败")
logger.debug("正在抽取长期消息样本")
for i in range(time_frequency.get("far")):
random_time = current_timestamp - random.randint(3600 * 4, 3600 * 24)
messages = self.random_get_msg_snippet(random_time, chat_size, max_memorized_time_per_msg)
if messages:
logger.debug(f"成功抽取长期消息样本{len(messages)}")
chat_samples.append(messages)
else:
logger.warning(f"{i}次长期消息样本抽取失败")
logger.debug(f"时间戳 {timestamp}消息样本抽取失败")
return chat_samples
async def memory_compress(self, messages: list, compress_rate=0.1):
"""压缩消息记录为记忆
Returns:
tuple: (压缩记忆集合, 相似主题字典)
"""
if not messages:
return set(), {}
@@ -313,15 +296,23 @@ class Hippocampus:
topics_response = await self.llm_topic_judge.generate_response(self.find_topic_llm(input_text, topic_num))
# Filter the topics
# Get the list of banned keywords from the config file
filter_keywords = global_config.memory_ban_words
# Replace fullwidth commas, enumeration commas and spaces in topics_response[0] with ASCII commas,
# then split on commas and strip the whitespace around each topic
topics = [
topic.strip()
for topic in topics_response[0].replace(",", ",").replace("、", ",").replace(" ", ",").split(",")
if topic.strip()
]
# Filter out topics that contain banned keywords
# any() checks whether a topic contains any keyword from filter_keywords
# Keep only topics free of banned keywords
filtered_topics = [topic for topic in topics if not any(keyword in topic for keyword in filter_keywords)]
logger.info(f"Topics after filtering: {filtered_topics}")
logger.debug(f"Topics after filtering: {filtered_topics}")
# Create request tasks for all topics
tasks = []
@@ -331,31 +322,42 @@ class Hippocampus:
tasks.append((topic.strip(), task))
# Wait for all tasks to complete
compressed_memory = set()
# Initialize the compressed-memory set and the similar-topics dict
compressed_memory = set() # stores compressed (topic, content) tuples
similar_topics_dict = {} # stores each topic's list of similar topics
# Iterate over each topic and its corresponding LLM task
for topic, task in tasks:
response = await task
if response:
# Add the topic and the LLM-generated content to the compressed memories
compressed_memory.add((topic, response[0]))
# Look up similar existing topics for each topic
# Look up similar existing topics for the current topic
existing_topics = list(self.memory_graph.G.nodes())
similar_topics = []
# Compute the similarity between the current topic and every existing topic
for existing_topic in existing_topics:
# Tokenize with jieba, turning each topic into a set of words
topic_words = set(jieba.cut(topic))
existing_words = set(jieba.cut(existing_topic))
all_words = topic_words | existing_words
v1 = [1 if word in topic_words else 0 for word in all_words]
v2 = [1 if word in existing_words else 0 for word in all_words]
# Build word vectors for computing cosine similarity
all_words = topic_words | existing_words # union of distinct words
v1 = [1 if word in topic_words else 0 for word in all_words] # word vector of the current topic
v2 = [1 if word in existing_words else 0 for word in all_words] # word vector of the existing topic
# Compute the cosine similarity
similarity = cosine_similarity(v1, v2)
if similarity >= 0.6:
# If the similarity exceeds the threshold, add it to the similar-topics list
if similarity >= 0.7:
similar_topics.append((existing_topic, similarity))
# Sort by similarity in descending order and keep only the top 3 most similar topics
similar_topics.sort(key=lambda x: x[1], reverse=True)
similar_topics = similar_topics[:5]
similar_topics = similar_topics[:3]
similar_topics_dict[topic] = similar_topics
return compressed_memory, similar_topics_dict
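The cosine_similarity helper called above is not part of this diff. A minimal sketch consistent with the binary word vectors built from the jieba token sets (an assumption, not the project's actual implementation) could be:

    import math

    def cosine_similarity(v1: list, v2: list) -> float:
        # Dot product divided by the product of the vector norms; for the
        # binary vectors above this reduces to |A ∩ B| / sqrt(|A| * |B|)
        # where A and B are the two word sets.
        dot = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)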
@@ -372,10 +374,13 @@ class Hippocampus:
)
return topic_num
async def operation_build_memory(self, chat_size=20):
time_frequency = {"near": 1, "mid": 4, "far": 4}
memory_samples = self.get_memory_sample(chat_size, time_frequency)
async def operation_build_memory(self):
logger.debug("------------------------------------开始构建记忆--------------------------------------")
start_time = time.time()
memory_samples = self.get_memory_sample()
all_added_nodes = []
all_connected_nodes = []
all_added_edges = []
for i, messages in enumerate(memory_samples, 1):
all_topics = []
# Visualize loading progress
@@ -387,12 +392,14 @@ class Hippocampus:
compress_rate = global_config.memory_compress_rate
compressed_memory, similar_topics_dict = await self.memory_compress(messages, compress_rate)
logger.info(f"压缩后记忆数量: {len(compressed_memory)},似曾相识的话题: {len(similar_topics_dict)}")
logger.debug(f"压缩后记忆数量: {compressed_memory},似曾相识的话题: {similar_topics_dict}")
current_time = datetime.datetime.now().timestamp()
logger.debug(f"添加节点: {', '.join(topic for topic, _ in compressed_memory)}")
all_added_nodes.extend(topic for topic, _ in compressed_memory)
# all_connected_nodes.extend(topic for topic, _ in similar_topics_dict)
for topic, memory in compressed_memory:
logger.info(f"添加节点: {topic}")
self.memory_graph.add_dot(topic, memory)
all_topics.append(topic)
@@ -402,7 +409,13 @@ class Hippocampus:
for similar_topic, similarity in similar_topics:
if topic != similar_topic:
strength = int(similarity * 10)
logger.info(f"连接相似节点: {topic}{similar_topic} (强度: {strength})")
logger.debug(f"连接相似节点: {topic}{similar_topic} (强度: {strength})")
all_added_edges.append(f"{topic}-{similar_topic}")
all_connected_nodes.append(topic)
all_connected_nodes.append(similar_topic)
self.memory_graph.G.add_edge(
topic,
similar_topic,
@@ -414,10 +427,21 @@ class Hippocampus:
# Connect related topics from the same batch
for i in range(len(all_topics)):
for j in range(i + 1, len(all_topics)):
logger.info(f"连接同批次节点: {all_topics[i]}{all_topics[j]}")
logger.debug(f"连接同批次节点: {all_topics[i]}{all_topics[j]}")
all_added_edges.append(f"{all_topics[i]}-{all_topics[j]}")
self.memory_graph.connect_dot(all_topics[i], all_topics[j])
logger.success(f"更新记忆: {', '.join(all_added_nodes)}")
logger.debug(f"强化连接: {', '.join(all_added_edges)}")
logger.info(f"强化连接节点: {', '.join(all_connected_nodes)}")
# logger.success(f"强化连接: {', '.join(all_added_edges)}")
self.sync_memory_to_db()
end_time = time.time()
logger.success(
f"--------------------------Memory build finished, took {end_time - start_time:.2f} "
"seconds--------------------------"
)
def sync_memory_to_db(self):
"""检查并同步内存中的图结构与数据库"""
@@ -844,10 +868,9 @@ class Hippocampus:
async def memory_activate_value(self, text: str, max_topics: int = 5, similarity_threshold: float = 0.3) -> int:
"""计算输入文本对记忆的激活程度"""
logger.info(f"识别主题: {await self._identify_topics(text)}")
# 识别主题
identified_topics = await self._identify_topics(text)
if not identified_topics:
return 0
@@ -908,7 +931,8 @@ class Hippocampus:
# Compute the final activation value
activation = int((topic_match + average_similarities) / 2 * 100)
logger.info(f"匹配率: {topic_match:.3f}, 平均相似度: {average_similarities:.3f}, 激活值: {activation}")
logger.info(f"识别主题: {identified_topics}, 匹配率: {topic_match:.3f}, 激活值: {activation}")
return activation
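A worked example of the activation formula above, assuming topic_match is the fraction of identified topics found in memory (the exact definition lies outside this excerpt): with topic_match = 0.4 and average_similarities = 0.5, activation = int((0.4 + 0.5) / 2 * 100) = 45.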


@@ -7,7 +7,6 @@ import sys
import time
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt
import networkx as nx
from dotenv import load_dotenv
@@ -16,7 +15,6 @@ sys.path.insert(0, sys.path[0]+"/../")
sys.path.insert(0, sys.path[0]+"/../")
sys.path.insert(0, sys.path[0]+"/../")
sys.path.insert(0, sys.path[0]+"/../")
print(sys.path)
from src.common.logger import get_module_logger
import jieba
@@ -25,6 +23,7 @@ import jieba
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
sys.path.append(root_path)
from src.common.logger import get_module_logger # noqa: E402
from src.common.database import db # noqa E402
from src.plugins.memory_system.offline_llm import LLMModel # noqa E402

File diff suppressed because it is too large


@@ -0,0 +1,170 @@
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
class DistributionVisualizer:
def __init__(self, mean=0, std=1, skewness=0, sample_size=10):
"""
初始化分布可视化器
参数:
mean (float): 期望均值
std (float): 标准差
skewness (float): 偏度
sample_size (int): 样本大小
"""
self.mean = mean
self.std = std
self.skewness = skewness
self.sample_size = sample_size
self.samples = None
def generate_samples(self):
"""生成具有指定参数的样本"""
if self.skewness == 0:
# With zero skewness, simply use a normal distribution
self.samples = np.random.normal(loc=self.mean, scale=self.std, size=self.sample_size)
else:
# Use scipy.stats to generate a skewed distribution
self.samples = stats.skewnorm.rvs(a=self.skewness,
loc=self.mean,
scale=self.std,
size=self.sample_size)
def get_weighted_samples(self):
"""获取加权后的样本数列"""
if self.samples is None:
self.generate_samples()
# Multiply the sample values by the sample size
return self.samples * self.sample_size
def get_statistics(self):
"""获取分布的统计信息"""
if self.samples is None:
self.generate_samples()
return {
"均值": np.mean(self.samples),
"标准差": np.std(self.samples),
"实际偏度": stats.skew(self.samples)
}
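DistributionVisualizer is not exercised by the __main__ block at the bottom of this file; a minimal usage sketch (illustrative only, with hypothetical parameter values) would be:

    # Generate 100 samples from a right-skewed distribution and inspect them
    viz = DistributionVisualizer(mean=0, std=1, skewness=4, sample_size=100)
    viz.generate_samples()
    print(viz.get_statistics())             # mean, std, actual skewness
    print(viz.get_weighted_samples()[:5])   # first five samples scaled by sample_size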
class MemoryBuildScheduler:
def __init__(self,
n_hours1, std_hours1, weight1,
n_hours2, std_hours2, weight2,
total_samples=50):
"""
初始化记忆构建调度器
参数:
n_hours1 (float): 第一个分布的均值(距离现在的小时数)
std_hours1 (float): 第一个分布的标准差(小时)
weight1 (float): 第一个分布的权重
n_hours2 (float): 第二个分布的均值(距离现在的小时数)
std_hours2 (float): 第二个分布的标准差(小时)
weight2 (float): 第二个分布的权重
total_samples (int): 要生成的总时间点数量
"""
# Normalize the weights
total_weight = weight1 + weight2
self.weight1 = weight1 / total_weight
self.weight2 = weight2 / total_weight
self.n_hours1 = n_hours1
self.std_hours1 = std_hours1
self.n_hours2 = n_hours2
self.std_hours2 = std_hours2
self.total_samples = total_samples
self.base_time = datetime.now()
def generate_time_samples(self):
"""生成混合分布的时间采样点"""
# 根据权重计算每个分布的样本数
samples1 = int(self.total_samples * self.weight1)
samples2 = self.total_samples - samples1
# Generate hour offsets from the two normal distributions
hours_offset1 = np.random.normal(
loc=self.n_hours1,
scale=self.std_hours1,
size=samples1
)
hours_offset2 = np.random.normal(
loc=self.n_hours2,
scale=self.std_hours2,
size=samples2
)
# Merge the offsets from both distributions
hours_offset = np.concatenate([hours_offset1, hours_offset2])
# Convert offsets to actual timestamps (abs() keeps every point in the past)
timestamps = [self.base_time - timedelta(hours=abs(offset)) for offset in hours_offset]
# Sort chronologically (earliest first)
return sorted(timestamps)
def get_timestamp_array(self):
"""返回时间戳数组"""
timestamps = self.generate_time_samples()
return [int(t.timestamp()) for t in timestamps]
def print_time_samples(timestamps, show_distribution=True):
"""打印时间样本和分布信息"""
print(f"\n生成的{len(timestamps)}个时间点分布:")
print("序号".ljust(5), "时间戳".ljust(25), "距现在(小时)")
print("-" * 50)
now = datetime.now()
time_diffs = []
for i, timestamp in enumerate(timestamps, 1):
hours_diff = (now - timestamp).total_seconds() / 3600
time_diffs.append(hours_diff)
print(f"{str(i).ljust(5)} {timestamp.strftime('%Y-%m-%d %H:%M:%S').ljust(25)} {hours_diff:.2f}")
# Print statistics
print("\nStatistics:")
print(f"Mean time offset: {np.mean(time_diffs):.2f} hours")
print(f"Std: {np.std(time_diffs):.2f} hours")
print(f"Earliest time: {min(timestamps).strftime('%Y-%m-%d %H:%M:%S')} ({max(time_diffs):.2f} hours ago)")
print(f"Latest time: {max(timestamps).strftime('%Y-%m-%d %H:%M:%S')} ({min(time_diffs):.2f} hours ago)")
if show_distribution:
# Compute a histogram of the time distribution
hist, bins = np.histogram(time_diffs, bins=40)
print("\nTime distribution (each * is one time point):")
for i in range(len(hist)):
if hist[i] > 0:
print(f"{bins[i]:6.1f}-{bins[i+1]:6.1f}小时: {'*' * int(hist[i])}")
# Usage example
if __name__ == "__main__":
# Create a memory scheduler with a bimodal distribution
scheduler = MemoryBuildScheduler(
n_hours1=12, # mean of distribution 1: 12 hours ago
std_hours1=8, # std of distribution 1
weight1=0.7, # weight of distribution 1 (70%)
n_hours2=36, # mean of distribution 2: 36 hours ago
std_hours2=24, # std of distribution 2
weight2=0.3, # weight of distribution 2 (30%)
total_samples=50 # generate 50 time points in total
)
# Generate the time distribution
timestamps = scheduler.generate_time_samples()
# Print the results, including the distribution visualization
print_time_samples(timestamps, show_distribution=True)
# Print the timestamp array
timestamp_array = scheduler.get_timestamp_array()
print("\n时间戳数组Unix时间戳")
print("[", end="")
for i, ts in enumerate(timestamp_array):
if i > 0:
print(", ", end="")
print(ts, end="")
print("]")