Files
Mofox-Core/scripts/analyze_expressions.py

216 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import time
import re
from datetime import datetime
from typing import Dict, List, Any
import sqlite3
def clean_group_name(name: str) -> str:
    """Reduce a group name to Chinese characters and ASCII letters.

    Every character outside the ranges U+4E00-U+9FA5, ``a-z`` and ``A-Z``
    is dropped.

    Args:
        name: The raw name to sanitise.

    Returns:
        The filtered name, or today's date formatted as ``YYYYMMDD`` when
        no character survives the filtering.
    """
    letters_only = re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", "", name)
    if letters_only:
        return letters_only
    # Nothing usable is left, so fall back to the current date as the name.
    return datetime.now().strftime("%Y%m%d")
def get_group_name(stream_id: str, db_path: str = "data/maibot.db") -> str:
    """Look up a display name for a chat stream in the SQLite database.

    The ``chat_streams`` row whose ``stream_id`` matches is read and the
    first non-empty candidate among these is returned, passed through
    ``clean_group_name``:

    1. ``group_name``
    2. ``user_nickname``
    3. ``platform`` followed by the first 8 characters of ``stream_id``

    Args:
        stream_id: Identifier of the chat stream to look up.
        db_path: Path of the SQLite database file. Defaults to the
            location the script has always used.

    Returns:
        The cleaned name, or ``stream_id`` unchanged when no row exists or
        every candidate column is empty.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            """
            SELECT group_name, user_nickname, platform
            FROM chat_streams
            WHERE stream_id = ?
            """,
            (stream_id,),
        )
        result = cursor.fetchone()
    finally:
        # Close even when the query raises, so the connection is never leaked.
        conn.close()

    if result:
        group_name, user_nickname, platform = result
        if group_name:
            return clean_group_name(group_name)
        if user_nickname:
            return clean_group_name(user_nickname)
        if platform:
            return clean_group_name(f"{platform}{stream_id[:8]}")
    return stream_id
def load_expressions(chat_id: str) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Load the stored expression lists that apply to one chat.

    Three JSON files under ``data/expression`` are consulted: the chat's
    ``learnt_style`` file, the chat's ``learnt_grammar`` file, and the
    shared ``personality`` file.

    Args:
        chat_id: Identifier of the chat; converted with ``str()`` to form
            the directory name.

    Returns:
        A ``(style, grammar, personality)`` tuple holding the parsed JSON
        content of each file. A file that does not exist contributes an
        empty list.
    """

    def _read_if_present(path: str) -> List[Dict[str, Any]]:
        # A missing file yields an empty list rather than an error.
        if not os.path.exists(path):
            return []
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    chat_key = str(chat_id)
    style_path = os.path.join("data", "expression", "learnt_style", chat_key, "expressions.json")
    grammar_path = os.path.join("data", "expression", "learnt_grammar", chat_key, "expressions.json")
    personality_path = os.path.join("data", "expression", "personality", "expressions.json")

    style_expressions = _read_if_present(style_path)
    grammar_expressions = _read_if_present(grammar_path)
    personality_expressions = _read_if_present(personality_path)
    return style_expressions, grammar_expressions, personality_expressions
def format_time(timestamp: float) -> str:
    """Render a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        timestamp: Seconds since the epoch.

    Returns:
        The formatted local date and time.
    """
    moment = datetime.fromtimestamp(timestamp)
    return moment.strftime("%Y-%m-%d %H:%M:%S")
def write_expressions(f, expressions: List[Dict[str, Any]], title: str):
    """Write a list of expressions to an open text stream.

    Args:
        f: Writable text stream (anything with a ``write`` method).
        expressions: Expression records. Each must contain ``situation``
            and ``style``; ``count`` defaults to 0 and
            ``last_active_time`` defaults to the current time.
        title: Heading written before the entries.

    When ``expressions`` is empty, a single "no data" line and a divider
    are written instead of the heading and entries.
    """
    divider = "-" * 40 + "\n"

    if expressions:
        f.write(f"{title}\n")
        for entry in expressions:
            usage = entry.get("count", 0)
            seen_at = entry.get("last_active_time", time.time())
            f.write(f"场景: {entry['situation']}\n")
            f.write(f"表达: {entry['style']}\n")
            f.write(f"计数: {usage:.4f}\n")
            f.write(f"最后活跃: {format_time(seen_at)}\n")
            # Each entry is closed off by its own divider line.
            f.write(divider)
        return

    # Nothing to list: emit the placeholder line and one divider.
    f.write(f"{title}:暂无数据\n")
    f.write(divider)
def write_group_report(
    group_file: str,
    group_name: str,
    chat_id: str,
    style_exprs: List[Dict[str, Any]],
    grammar_exprs: List[Dict[str, Any]],
):
    """Write the detailed report file for a single group.

    The file starts with a header naming the group and its ID, followed by
    a language-style section and a grammar section, each rendered with
    ``write_expressions``.

    Args:
        group_file: Path of the report file; it is created or overwritten.
        group_name: Name shown in the report header.
        chat_id: Chat identifier shown in the report header.
        style_exprs: Expressions for the language-style section.
        grammar_exprs: Expressions for the grammar section.
    """
    section_rule = "=" * 40 + "\n"
    # (section heading, records, title passed to write_expressions)
    sections = (
        ("【语言风格】\n", style_exprs, "语言风格"),
        ("【句法特点】\n", grammar_exprs, "句法特点"),
    )

    with open(group_file, "w", encoding="utf-8") as report:
        report.write(f"群组: {group_name} (ID: {chat_id})\n")
        report.write("=" * 80 + "\n\n")

        for position, (heading, records, label) in enumerate(sections):
            if position:
                # A blank line separates consecutive sections.
                report.write("\n")
            report.write(heading)
            report.write(section_rule)
            write_expressions(report, records, label)
def analyze_expressions(top_n: int = 20):
    """Generate text reports summarising the stored expressions.

    The following files are written under ``data/expression_analysis``
    (``<ts>`` is the run's ``YYYYmmdd_HHMMSS`` timestamp):

    * ``summary_<ts>.txt`` - totals for the personality expressions and
      for every group, with the path of each detailed report.
    * ``personality/expressions_<ts>.txt`` - the top personality
      expressions.
    * ``<name>_<chat_id>/expressions_<ts>.txt`` - the top style and grammar
      expressions of each sub-directory of
      ``data/expression/learnt_style``.

    "Top" means highest ``count`` first (a missing ``count`` is 0).

    Args:
        top_n: Maximum number of expressions listed per category in the
            detailed reports. Defaults to 20.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    def _top_by_count(exprs):
        # Highest count first; sorted() is stable, so ties keep file order.
        return sorted(exprs, key=lambda x: x.get("count", 0), reverse=True)[:top_n]

    # Collect all group IDs. A missing directory just means nothing has been
    # learnt yet, so treat it as "no groups" rather than letting os.listdir
    # raise. Sorting makes the report order deterministic.
    style_dir = os.path.join("data", "expression", "learnt_style")
    if os.path.isdir(style_dir):
        chat_ids = sorted(d for d in os.listdir(style_dir) if os.path.isdir(os.path.join(style_dir, d)))
    else:
        chat_ids = []

    # Create the output directories.
    output_dir = "data/expression_analysis"
    personality_dir = os.path.join(output_dir, "personality")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(personality_dir, exist_ok=True)

    # Use one instant for both file names and the header so they always agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    # Create the summary report.
    summary_file = os.path.join(output_dir, f"summary_{timestamp}.txt")
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(f"表达方式分析报告 - 生成时间: {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("=" * 80 + "\n\n")

        # Personality expressions are handled first.
        personality_exprs = []
        personality_file = os.path.join("data", "expression", "personality", "expressions.json")
        if os.path.exists(personality_file):
            with open(personality_file, "r", encoding="utf-8") as pf:
                personality_exprs = json.load(pf)

        # Remember the full count before truncating to the top entries.
        total_personality = len(personality_exprs)
        personality_exprs = _top_by_count(personality_exprs)

        # Write the detailed personality report.
        personality_report = os.path.join(personality_dir, f"expressions_{timestamp}.txt")
        with open(personality_report, "w", encoding="utf-8") as pf:
            pf.write("【人格表达方式】\n")
            pf.write("=" * 40 + "\n")
            write_expressions(pf, personality_exprs, "人格表达")

        # Personality section of the summary.
        f.write("【人格表达方式】\n")
        f.write("=" * 40 + "\n")
        f.write(f"人格表达总数: {total_personality} (显示前{top_n}条)\n")
        f.write(f"详细报告: {personality_report}\n")
        f.write("-" * 40 + "\n\n")

        # Per-group section of the summary.
        f.write("【群组表达方式】\n")
        f.write("=" * 40 + "\n\n")
        for chat_id in chat_ids:
            style_exprs, grammar_exprs, _ = load_expressions(chat_id)

            # Remember the full counts before truncating.
            total_style = len(style_exprs)
            total_grammar = len(grammar_exprs)
            style_exprs = _top_by_count(style_exprs)
            grammar_exprs = _top_by_count(grammar_exprs)

            group_name = get_group_name(chat_id)

            # The group's sub-directory uses the cleaned name plus the chat ID.
            safe_group_name = clean_group_name(group_name)
            group_dir = os.path.join(output_dir, f"{safe_group_name}_{chat_id}")
            os.makedirs(group_dir, exist_ok=True)

            # Write the detailed group report.
            group_file = os.path.join(group_dir, f"expressions_{timestamp}.txt")
            write_group_report(group_file, group_name, chat_id, style_exprs, grammar_exprs)

            # Group entry in the summary.
            f.write(f"群组: {group_name} (ID: {chat_id})\n")
            f.write("-" * 40 + "\n")
            f.write(f"语言风格总数: {total_style} (显示前{top_n}条)\n")
            f.write(f"句法特点总数: {total_grammar} (显示前{top_n}条)\n")
            f.write(f"详细报告: {group_file}\n")
            f.write("-" * 40 + "\n\n")

    print("分析报告已生成:")
    print(f"总报告: {summary_file}")
    print(f"人格表达报告: {personality_report}")
    print(f"各群组详细报告位于: {output_dir}")
# Script entry point: the reports are generated only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    analyze_expressions()