diff --git a/scripts/analyze_group_similarity.py b/scripts/analyze_group_similarity.py index 4d7e182ef..86863a52c 100644 --- a/scripts/analyze_group_similarity.py +++ b/scripts/analyze_group_similarity.py @@ -42,21 +42,26 @@ def get_group_name(stream_id): return f"{platform}-{stream_id[:8]}" return stream_id -def load_group_expressions(group_dir): - """加载单个群组的表达方式数据""" +def load_group_data(group_dir): + """加载单个群组的数据""" json_path = Path(group_dir) / "expressions.json" if not json_path.exists(): - return [] + return [], [], [] with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) - # 将所有表达方式合并成一个文本 - all_expressions = [] - for item in data: - all_expressions.extend([item['style']] * item['count']) + situations = [] + styles = [] + combined = [] - return ' '.join(all_expressions) + for item in data: + count = item['count'] + situations.extend([item['situation']] * count) + styles.extend([item['style']] * count) + combined.extend([f"{item['situation']} {item['style']}"] * count) + + return situations, styles, combined def analyze_group_similarity(): # 获取所有群组目录 @@ -67,70 +72,109 @@ def analyze_group_similarity(): # 获取群组名称 group_names = [get_group_name(group_id) for group_id in group_ids] - # 加载所有群组的表达方式 - group_texts = [load_group_expressions(d) for d in group_dirs] + # 加载所有群组的数据 + group_situations = [] + group_styles = [] + group_combined = [] - # 使用TF-IDF向量化文本 + for d in group_dirs: + situations, styles, combined = load_group_data(d) + group_situations.append(' '.join(situations)) + group_styles.append(' '.join(styles)) + group_combined.append(' '.join(combined)) + + # 创建TF-IDF向量化器 vectorizer = TfidfVectorizer() - tfidf_matrix = vectorizer.fit_transform(group_texts) - # 计算余弦相似度 - similarity_matrix = cosine_similarity(tfidf_matrix) + # 计算三种相似度矩阵 + situation_matrix = cosine_similarity(vectorizer.fit_transform(group_situations)) + style_matrix = cosine_similarity(vectorizer.fit_transform(group_styles)) + combined_matrix = cosine_similarity(vectorizer.fit_transform(group_combined)) # 对相似度矩阵进行对数变换 - log_similarity_matrix = np.log1p(similarity_matrix) + log_situation_matrix = np.log1p(situation_matrix) + log_style_matrix = np.log1p(style_matrix) + log_combined_matrix = np.log1p(combined_matrix) - # 创建热力图 - plt.figure(figsize=(15, 12)) - sns.heatmap(log_similarity_matrix, + # 创建一个大图,包含三个子图 + plt.figure(figsize=(45, 12)) + + # 场景相似度热力图 + plt.subplot(1, 3, 1) + sns.heatmap(log_situation_matrix, xticklabels=group_names, yticklabels=group_names, cmap='YlOrRd', annot=True, fmt='.2f', vmin=0, - vmax=np.log1p(0.2)) # 调整最大值以匹配对数变换 + vmax=np.log1p(0.2)) + plt.title('群组场景相似度热力图 (对数变换)') + plt.xticks(rotation=45, ha='right') + + # 表达方式相似度热力图 + plt.subplot(1, 3, 2) + sns.heatmap(log_style_matrix, + xticklabels=group_names, + yticklabels=group_names, + cmap='YlOrRd', + annot=True, + fmt='.2f', + vmin=0, + vmax=np.log1p(0.2)) plt.title('群组表达方式相似度热力图 (对数变换)') plt.xticks(rotation=45, ha='right') + + # 组合相似度热力图 + plt.subplot(1, 3, 3) + sns.heatmap(log_combined_matrix, + xticklabels=group_names, + yticklabels=group_names, + cmap='YlOrRd', + annot=True, + fmt='.2f', + vmin=0, + vmax=np.log1p(0.2)) + plt.title('群组场景+表达方式相似度热力图 (对数变换)') + plt.xticks(rotation=45, ha='right') + plt.tight_layout() - plt.savefig(SCRIPT_DIR / 'group_similarity_heatmap.png', dpi=300, bbox_inches='tight') + plt.savefig(SCRIPT_DIR / 'group_similarity_heatmaps.png', dpi=300, bbox_inches='tight') plt.close() - # 创建网络图 - G = nx.Graph() - - # 添加节点 - for group_id, group_name in zip(group_ids, group_names): - G.add_node(group_id, label=group_name) - - # 添加边(使用对数变换后的相似度) - for i in range(len(group_ids)): - for j in range(i+1, len(group_ids)): - if log_similarity_matrix[i][j] > np.log1p(0.05): # 调整阈值 - G.add_edge(group_ids[i], group_ids[j], - weight=log_similarity_matrix[i][j]) - - # 绘制网络图 - plt.figure(figsize=(20, 20)) - pos = nx.spring_layout(G, k=1, iterations=50) - - # 绘制节点 - nx.draw_networkx_nodes(G, pos, node_size=20000, node_color='lightblue', alpha=0.8) - - # 绘制边 - edges = G.edges() - weights = [G[u][v]['weight'] * 40 for u, v in edges] # 增加线条粗细系数 - nx.draw_networkx_edges(G, pos, width=weights, alpha=0.6, edge_color='gray') - - # 添加标签 - labels = {node: G.nodes[node]['label'] for node in G.nodes()} - nx.draw_networkx_labels(G, pos, labels, font_size=20, font_weight='bold') - - plt.title('群组表达方式相似度网络图\n(连线粗细表示对数变换后的相似度)') - plt.axis('off') - plt.tight_layout() - plt.savefig(SCRIPT_DIR / 'group_similarity_network.png', dpi=300, bbox_inches='tight') - plt.close() + # 保存匹配详情到文本文件 + with open(SCRIPT_DIR / 'group_similarity_details.txt', 'w', encoding='utf-8') as f: + f.write('群组相似度详情\n') + f.write('=' * 50 + '\n\n') + + for i in range(len(group_ids)): + for j in range(i+1, len(group_ids)): + if log_combined_matrix[i][j] > np.log1p(0.05): + f.write(f'群组1: {group_names[i]}\n') + f.write(f'群组2: {group_names[j]}\n') + f.write(f'场景相似度: {situation_matrix[i][j]:.4f}\n') + f.write(f'表达方式相似度: {style_matrix[i][j]:.4f}\n') + f.write(f'组合相似度: {combined_matrix[i][j]:.4f}\n') + + # 获取两个群组的数据 + situations1, styles1, _ = load_group_data(group_dirs[i]) + situations2, styles2, _ = load_group_data(group_dirs[j]) + + # 找出共同的场景 + common_situations = set(situations1) & set(situations2) + if common_situations: + f.write('\n共同场景:\n') + for situation in common_situations: + f.write(f'- {situation}\n') + + # 找出共同的表达方式 + common_styles = set(styles1) & set(styles2) + if common_styles: + f.write('\n共同表达方式:\n') + for style in common_styles: + f.write(f'- {style}\n') + + f.write('\n' + '-' * 50 + '\n\n') if __name__ == "__main__": analyze_group_similarity() diff --git a/scripts/mongodb_to_sqlite.py b/scripts/mongodb_to_sqlite.py index c6d2950fd..edd27e435 100644 --- a/scripts/mongodb_to_sqlite.py +++ b/scripts/mongodb_to_sqlite.py @@ -182,25 +182,6 @@ class MongoToSQLiteMigrator: enable_validation=False, # 禁用数据验证 unique_fields=["stream_id"], ), - # LLM使用记录迁移配置 - MigrationConfig( - mongo_collection="llm_usage", - target_model=LLMUsage, - field_mapping={ - "model_name": "model_name", - "user_id": "user_id", - "request_type": "request_type", - "endpoint": "endpoint", - "prompt_tokens": "prompt_tokens", - "completion_tokens": "completion_tokens", - "total_tokens": "total_tokens", - "cost": "cost", - "status": "status", - "timestamp": "timestamp", - }, - enable_validation=True, # 禁用数据验证" - unique_fields=["user_id", "prompt_tokens", "completion_tokens", "total_tokens", "cost"], # 组合唯一性 - ), # 消息迁移配置 MigrationConfig( mongo_collection="messages", diff --git a/src/plugins/test_plugin/actions/mute_action.py b/src/plugins/test_plugin/actions/mute_action.py index 4069bf8a6..afd31eaae 100644 --- a/src/plugins/test_plugin/actions/mute_action.py +++ b/src/plugins/test_plugin/actions/mute_action.py @@ -22,7 +22,7 @@ class MuteAction(PluginAction): "当有人发了擦边,或者色情内容时使用", "当有人要求禁言自己时使用", ] - default = True # 默认动作,是否手动添加到使用集 + default = False # 默认动作,是否手动添加到使用集 associated_types = ["command", "text"] # associated_types = ["text"]