From d30f35fe20b6c87d40e275fe40c4a0fee382ed9b Mon Sep 17 00:00:00 2001
From: SengokuCola <1026294844@qq.com>
Date: Wed, 4 Jun 2025 22:24:20 +0800
Subject: [PATCH] update: update script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/analyze_group_similarity.py           | 96 ++++++++++---------
 .../focus_chat/planners/planner_simple.py     |  4 -
 template/bot_config_template.toml             | 10 +-
 3 files changed, 56 insertions(+), 54 deletions(-)

diff --git a/scripts/analyze_group_similarity.py b/scripts/analyze_group_similarity.py
index 7831b62bd..b61167f70 100644
--- a/scripts/analyze_group_similarity.py
+++ b/scripts/analyze_group_similarity.py
@@ -48,7 +48,7 @@ def load_group_data(group_dir):
     """Load the data for a single group."""
     json_path = Path(group_dir) / "expressions.json"
     if not json_path.exists():
-        return [], [], []
+        return [], [], [], 0
 
     with open(json_path, "r", encoding="utf-8") as f:
         data = json.load(f)
@@ -56,6 +56,7 @@ def load_group_data(group_dir):
     situations = []
     styles = []
     combined = []
+    total_count = sum(item["count"] for item in data)
 
     for item in data:
         count = item["count"]
@@ -63,41 +64,46 @@ def load_group_data(group_dir):
         styles.extend([item["style"]] * count)
         combined.extend([f"{item['situation']} {item['style']}"] * count)
 
-    return situations, styles, combined
+    return situations, styles, combined, total_count
 
 
 def analyze_group_similarity():
     # Collect all group directories
     base_dir = Path("data/expression/learnt_style")
     group_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
-    group_ids = [d.name for d in group_dirs]
-
-    # Resolve the group names
-    group_names = [get_group_name(group_id) for group_id in group_ids]
-
-    # Load every group's data
-    group_situations = []
-    group_styles = []
-    group_combined = []
-
+
+    # Load every group's data and filter out small groups
+    valid_groups = []
+    valid_names = []
+    valid_situations = []
+    valid_styles = []
+    valid_combined = []
+
     for d in group_dirs:
-        situations, styles, combined = load_group_data(d)
-        group_situations.append(" ".join(situations))
-        group_styles.append(" ".join(styles))
-        group_combined.append(" ".join(combined))
-
+        situations, styles, combined, total_count = load_group_data(d)
+        if total_count >= 50:  # keep only groups with at least 50 expressions
+            valid_groups.append(d)
+            valid_names.append(get_group_name(d.name))
+            valid_situations.append(" ".join(situations))
+            valid_styles.append(" ".join(styles))
+            valid_combined.append(" ".join(combined))
+
+    if not valid_groups:
+        print("No groups with at least 50 expressions were found")
+        return
+
     # Create the TF-IDF vectorizer
     vectorizer = TfidfVectorizer()
 
     # Compute the three similarity matrices
-    situation_matrix = cosine_similarity(vectorizer.fit_transform(group_situations))
-    style_matrix = cosine_similarity(vectorizer.fit_transform(group_styles))
-    combined_matrix = cosine_similarity(vectorizer.fit_transform(group_combined))
+    situation_matrix = cosine_similarity(vectorizer.fit_transform(valid_situations))
+    style_matrix = cosine_similarity(vectorizer.fit_transform(valid_styles))
+    combined_matrix = cosine_similarity(vectorizer.fit_transform(valid_combined))
 
     # Log-transform the similarity matrices
-    log_situation_matrix = np.log1p(situation_matrix)
-    log_style_matrix = np.log1p(style_matrix)
-    log_combined_matrix = np.log1p(combined_matrix)
+    log_situation_matrix = np.log10(situation_matrix * 100 + 1) * 10 / np.log10(4)
+    log_style_matrix = np.log10(style_matrix * 100 + 1) * 10 / np.log10(4)
+    log_combined_matrix = np.log10(combined_matrix * 100 + 1) * 10 / np.log10(4)
 
     # Create one large figure with three subplots
     plt.figure(figsize=(45, 12))
@@ -106,45 +112,45 @@ def analyze_group_similarity():
     plt.subplot(1, 3, 1)
     sns.heatmap(
         log_situation_matrix,
-        xticklabels=group_names,
-        yticklabels=group_names,
+        xticklabels=valid_names,
+        yticklabels=valid_names,
         cmap="YlOrRd",
         annot=True,
-        fmt=".2f",
+        fmt=".1f",
         vmin=0,
-        vmax=np.log1p(0.2),
+        vmax=30,
     )
-    plt.title("Group situation similarity heatmap (log transform)")
+    plt.title("Group situation similarity heatmap (log percentage)")
     plt.xticks(rotation=45, ha="right")
 
     # Style similarity heatmap
     plt.subplot(1, 3, 2)
     sns.heatmap(
         log_style_matrix,
-        xticklabels=group_names,
-        yticklabels=group_names,
+        xticklabels=valid_names,
+        yticklabels=valid_names,
         cmap="YlOrRd",
         annot=True,
-        fmt=".2f",
+        fmt=".1f",
         vmin=0,
-        vmax=np.log1p(0.2),
+        vmax=30,
     )
-    plt.title("Group style similarity heatmap (log transform)")
+    plt.title("Group style similarity heatmap (log percentage)")
     plt.xticks(rotation=45, ha="right")
 
     # Combined similarity heatmap
     plt.subplot(1, 3, 3)
     sns.heatmap(
         log_combined_matrix,
-        xticklabels=group_names,
-        yticklabels=group_names,
+        xticklabels=valid_names,
+        yticklabels=valid_names,
         cmap="YlOrRd",
         annot=True,
-        fmt=".2f",
+        fmt=".1f",
         vmin=0,
-        vmax=np.log1p(0.2),
+        vmax=30,
     )
-    plt.title("Group situation + style similarity heatmap (log transform)")
+    plt.title("Group situation + style similarity heatmap (log percentage)")
     plt.xticks(rotation=45, ha="right")
 
     plt.tight_layout()
@@ -156,18 +162,18 @@ def analyze_group_similarity():
         f.write("Group similarity details\n")
         f.write("=" * 50 + "\n\n")
 
-    for i in range(len(group_ids)):
-        for j in range(i + 1, len(group_ids)):
-            if log_combined_matrix[i][j] > np.log1p(0.05):
-                f.write(f"Group 1: {group_names[i]}\n")
-                f.write(f"Group 2: {group_names[j]}\n")
+    for i in range(len(valid_names)):
+        for j in range(i + 1, len(valid_names)):
+            if combined_matrix[i][j] > 0.05:  # compare raw similarity: the log scale tops out near 33.3, so a cutoff of 50 would never fire
+                f.write(f"Group 1: {valid_names[i]}\n")
+                f.write(f"Group 2: {valid_names[j]}\n")
                 f.write(f"Situation similarity: {situation_matrix[i][j]:.4f}\n")
                 f.write(f"Style similarity: {style_matrix[i][j]:.4f}\n")
                 f.write(f"Combined similarity: {combined_matrix[i][j]:.4f}\n")
 
                 # Load both groups' data
-                situations1, styles1, _ = load_group_data(group_dirs[i])
-                situations2, styles2, _ = load_group_data(group_dirs[j])
+                situations1, styles1, _, _ = load_group_data(valid_groups[i])  # now returns 4 values
+                situations2, styles2, _, _ = load_group_data(valid_groups[j])
 
                 # Find the situations both groups share
                 common_situations = set(situations1) & set(situations2)
diff --git a/src/chat/focus_chat/planners/planner_simple.py b/src/chat/focus_chat/planners/planner_simple.py
index 4e9477268..cddd53e2d 100644
--- a/src/chat/focus_chat/planners/planner_simple.py
+++ b/src/chat/focus_chat/planners/planner_simple.py
@@ -187,10 +187,6 @@ class ActionPlanner(BasePlanner):
             prompt = f"{prompt}"
             llm_content, (reasoning_content, _) = await self.planner_llm.generate_response_async(prompt=prompt)
 
-            # logger.info(
-            #     f"{self.log_prefix}Planner prompt:\n{prompt}\n\nLLM raw response: {llm_content}'"
-            # )
-
             logger.debug(f"{self.log_prefix}LLM raw reasoning response: {reasoning_content}")
         except Exception as req_e:
             logger.error(f"{self.log_prefix}LLM request failed: {req_e}")
diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml
index 9fe19224f..fc11ebbf0 100644
--- a/template/bot_config_template.toml
+++ b/template/bot_config_template.toml
@@ -115,19 +115,19 @@ content_filtration = false # Whether to filter stickers; only matching ones are
 filtration_prompt = "符合公序良俗" # Sticker filtering requirement; only stickers that satisfy it are saved
 
 [memory]
-memory_build_interval = 2000 # Memory build interval, in seconds. A lower interval lets MaiMai learn more, but also accumulates more redundant information
+memory_build_interval = 1000 # Memory build interval, in seconds. A lower interval lets MaiMai learn more, but also accumulates more redundant information
 memory_build_distribution = [6.0, 3.0, 0.6, 32.0, 12.0, 0.4] # Memory build distribution; parameters: mean, std dev, and weight of distribution 1, then mean, std dev, and weight of distribution 2
-memory_build_sample_num = 6 # Number of samples; higher values mean more memory sampling passes
-memory_build_sample_length = 40 # Sample length; higher values make each memory richer
+memory_build_sample_num = 4 # Number of samples; higher values mean more memory sampling passes
+memory_build_sample_length = 30 # Sample length; higher values make each memory richer
 memory_compress_rate = 0.1 # Memory compression rate; controls how much memories are condensed. Keep the default; raising it retains more information but also more redundancy
 forget_memory_interval = 1000 # Memory forgetting interval, in seconds. A lower interval makes MaiMai forget more often, keeping memory leaner but making learning harder
 memory_forget_time = 24 # How old a memory must be before it can be forgotten, in hours
 memory_forget_percentage = 0.01 # Fraction of memories to forget; larger values forget more. Keep the default
-consolidate_memory_interval = 2000 # Memory consolidation interval, in seconds. A lower interval consolidates more often, keeping memory leaner
+consolidate_memory_interval = 1000 # Memory consolidation interval, in seconds. A lower interval consolidates more often, keeping memory leaner
 consolidation_similarity_threshold = 0.7 # Similarity threshold
-consolidation_check_percentage = 0.01 # Fraction of nodes to check
+consolidation_check_percentage = 0.05 # Fraction of nodes to check
 
 # Words that should not be memorized; existing memories are unaffected and must be cleaned up manually
 memory_ban_words = [ "表情包", "图片", "回复", "聊天记录" ]
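
For context on the numbers the patched script draws, here is a minimal standalone sketch of the same pipeline. The toy documents and variable names are illustrative assumptions, not the real per-group expressions.json corpora; only the log-percentage transform is taken verbatim from the patch.

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Hypothetical per-group corpora: one space-joined document per group,
    # mirroring how analyze_group_similarity() concatenates each group's data.
    docs = [
        "greeting banter meme greeting",
        "greeting banter meme question",
        "stacktrace review question deploy",
    ]

    # Cosine similarity over TF-IDF vectors, as in the script.
    sim = cosine_similarity(TfidfVectorizer().fit_transform(docs))

    # The patch's log-percentage transform: maps a similarity in [0, 1] onto
    # roughly [0, 33.3], since log10(1 * 100 + 1) * 10 / log10(4) ~= 33.3,
    # which is why the heatmaps cap the color scale at vmax=30.
    log_sim = np.log10(sim * 100 + 1) * 10 / np.log10(4)

    print(np.round(sim, 3))      # raw cosine similarities
    print(np.round(log_sim, 1))  # the values annotated on the heatmaps

    # A raw similarity of 0.05 (the report cutoff) sits near 12.9 on this scale.
    print(np.log10(0.05 * 100 + 1) * 10 / np.log10(4))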