Files
Mofox-Core/scripts/cleanup_expressions.py
春河晴 3e854719ee ruff
2025-06-10 17:31:05 +09:00

126 lines
4.3 KiB
Python

import glob
import json
import os
import random
from typing import Dict, List, Optional, Tuple
MAX_EXPRESSION_COUNT = 300 # Maximum number of expressions kept per chat group by cleanup_expressions()
MIN_COUNT_THRESHOLD = 0.01 # Minimum usage count: cleanup_expressions() keeps only entries whose "count" is strictly greater
def load_expressions(chat_id: str) -> Tuple[List[Dict], List[Dict]]:
    """Read the stored style and grammar expressions for one chat.

    Looks for ``expressions.json`` under
    ``data/expression/learnt_style/<chat_id>/`` and
    ``data/expression/learnt_grammar/<chat_id>/``, both relative to the
    current working directory. A file that does not exist yields an empty
    list for that category.

    Args:
        chat_id: Identifier of the chat; passed through ``str()`` to build the
            directory name.

    Returns:
        A ``(style_exprs, grammar_exprs)`` tuple holding whatever each JSON
        file decodes to.

    Raises:
        json.JSONDecodeError: If an existing file does not contain valid JSON.
    """
    loaded = []
    # Style first, then grammar -- the same order the result tuple uses.
    for category in ("learnt_style", "learnt_grammar"):
        path = os.path.join("data", "expression", category, str(chat_id), "expressions.json")
        if not os.path.exists(path):
            loaded.append([])
            continue
        with open(path, "r", encoding="utf-8") as fh:
            loaded.append(json.load(fh))
    style_exprs, grammar_exprs = loaded
    return style_exprs, grammar_exprs
def save_expressions(chat_id: str, style_exprs: List[Dict], grammar_exprs: List[Dict]) -> None:
    """Write both expression lists for one chat back to disk as JSON.

    The style list goes to
    ``data/expression/learnt_style/<chat_id>/expressions.json`` and the grammar
    list to ``data/expression/learnt_grammar/<chat_id>/expressions.json``
    (relative to the current working directory). Missing directories are
    created and existing files are overwritten.

    Args:
        chat_id: Identifier of the chat; passed through ``str()`` to build the
            directory name.
        style_exprs: Expressions to store in the style file.
        grammar_exprs: Expressions to store in the grammar file.
    """
    targets = [
        (os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json"), style_exprs),
        (os.path.join("data", "expression", "learnt_grammar", str(chat_id), "expressions.json"), grammar_exprs),
    ]
    # Create both directories up front so nothing is written if either
    # directory cannot be created.
    for path, _ in targets:
        os.makedirs(os.path.dirname(path), exist_ok=True)
    for path, payload in targets:
        with open(path, "w", encoding="utf-8") as fh:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(payload, fh, ensure_ascii=False, indent=2)
def cleanup_expressions(
    expressions: List[Dict],
    *,
    max_count: Optional[int] = None,
    min_count: Optional[float] = None,
    rng: Optional[random.Random] = None,
) -> List[Dict]:
    """Prune a list of expressions by usage count and by total size.

    First, every entry whose ``"count"`` (treated as 0 when missing) is not
    strictly greater than ``min_count`` is dropped. If more than ``max_count``
    entries remain, the ``max_count // 2`` entries with the highest count are
    kept and the rest of the quota is filled with a random sample of the
    remaining entries.

    The input list is not modified; a new list is returned.

    Args:
        expressions: Expression dicts; only the ``"count"`` key is read.
        max_count: Maximum number of entries to return. Defaults to the
            module-level ``MAX_EXPRESSION_COUNT``.
        min_count: Usage-count threshold. Defaults to the module-level
            ``MIN_COUNT_THRESHOLD``.
        rng: Random source used for the sample. Defaults to the global
            ``random`` module; pass a seeded ``random.Random`` for
            reproducible results.

    Returns:
        The filtered entries in their original order when no trimming was
        needed; otherwise the top entries by descending count followed by the
        randomly chosen ones.

    Raises:
        ValueError: If ``max_count`` is negative.
    """
    if not expressions:
        return []

    # Resolve defaults lazily so explicit arguments never touch the globals.
    if max_count is None:
        max_count = MAX_EXPRESSION_COUNT
    if min_count is None:
        min_count = MIN_COUNT_THRESHOLD
    if max_count < 0:
        raise ValueError("max_count must be non-negative")

    # 1. Drop entries that are used too rarely.
    kept = [expr for expr in expressions if expr.get("count", 0) > min_count]
    if len(kept) <= max_count:
        return kept

    # 2. Too many left: always keep the most-used half of the quota ...
    ranked = sorted(kept, key=lambda expr: expr.get("count", 0), reverse=True)
    top_n = max_count // 2
    # ... and fill the other half at random, so less-used entries still have
    # a chance to survive. len(ranked) > max_count guarantees the pool is
    # larger than the number of entries sampled.
    sampler = rng if rng is not None else random
    return ranked[:top_n] + sampler.sample(ranked[top_n:], max_count - top_n)
def _discover_chat_ids() -> List[str]:
    """Return the sorted chat IDs that have a directory in either expression store."""
    chat_ids = set()
    for category in ("learnt_style", "learnt_grammar"):
        for entry in glob.glob(os.path.join("data", "expression", category, "*")):
            # Only directories are chat folders. A stray file would otherwise
            # be treated as a chat ID and break save_expressions() when it
            # tries to create a directory at that path.
            if os.path.isdir(entry):
                chat_ids.add(os.path.basename(entry))
    return sorted(chat_ids)


def main():
    """Clean the stored expressions of every chat and print a summary.

    Chat IDs are the directory names found under
    ``data/expression/learnt_style`` and ``data/expression/learnt_grammar``.
    For each chat, both expression lists are loaded, pruned with
    ``cleanup_expressions()`` and written back; before/after counts are
    printed per chat and in total.
    """
    chat_ids = _discover_chat_ids()
    if not chat_ids:
        print("没有找到任何群聊的表达方式数据")
        return

    print(f"开始清理 {len(chat_ids)} 个群聊的表达方式数据...")

    total_style_before = 0
    total_style_after = 0
    total_grammar_before = 0
    total_grammar_after = 0

    for chat_id in chat_ids:
        print(f"\n处理群聊 {chat_id}:")

        # Load the current expressions and remember how many there were.
        style_exprs, grammar_exprs = load_expressions(chat_id)
        style_count_before = len(style_exprs)
        grammar_count_before = len(grammar_exprs)
        total_style_before += style_count_before
        total_grammar_before += grammar_count_before

        # Prune both lists and record the resulting sizes.
        style_exprs = cleanup_expressions(style_exprs)
        grammar_exprs = cleanup_expressions(grammar_exprs)
        style_count_after = len(style_exprs)
        grammar_count_after = len(grammar_exprs)
        total_style_after += style_count_after
        total_grammar_after += grammar_count_after

        # Persist the pruned lists.
        save_expressions(chat_id, style_exprs, grammar_exprs)

        print(f"语言风格: {style_count_before} -> {style_count_after}")
        print(f"句法特点: {grammar_count_before} -> {grammar_count_after}")

    print("\n清理完成!")
    print(f"语言风格总数: {total_style_before} -> {total_style_after}")
    print(f"句法特点总数: {total_grammar_before} -> {total_grammar_after}")
    removed = total_style_before + total_grammar_before - total_style_after - total_grammar_after
    print(f"总共清理了 {removed} 条表达方式")


if __name__ == "__main__":
    main()