126 lines
4.3 KiB
Python
126 lines
4.3 KiB
Python
import os
|
|
import json
|
|
import random
|
|
from typing import List, Dict, Tuple
|
|
import glob
|
|
|
|
# Maximum number of expressions retained per group chat.
MAX_EXPRESSION_COUNT = 300
|
|
# Minimum usage-count threshold: cleanup_expressions keeps only entries whose
# "count" value is strictly greater than this.
MIN_COUNT_THRESHOLD = 0.01
|
|
|
|
|
|
def load_expressions(chat_id: str) -> Tuple[List[Dict], List[Dict]]:
    """Read the stored style and grammar expressions of one chat.

    Each category lives in
    ``data/expression/<category>/<chat_id>/expressions.json`` (relative to the
    current working directory), where ``<category>`` is ``learnt_style`` or
    ``learnt_grammar``.

    Args:
        chat_id: Identifier of the chat; converted with ``str()`` to build the
            directory name.

    Returns:
        A ``(style_exprs, grammar_exprs)`` tuple holding whatever JSON value
        each file contains. A category whose file does not exist yields an
        empty list.

    Raises:
        json.JSONDecodeError: If an existing file does not contain valid JSON.
    """
    loaded = []
    for category in ("learnt_style", "learnt_grammar"):
        path = os.path.join("data", "expression", category, str(chat_id), "expressions.json")
        entries = []
        # A missing file simply means nothing was learnt for this category.
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as handle:
                entries = json.load(handle)
        loaded.append(entries)

    style_entries, grammar_entries = loaded
    return style_entries, grammar_entries
|
|
|
|
|
|
def save_expressions(chat_id: str, style_exprs: List[Dict], grammar_exprs: List[Dict]) -> None:
    """Write the style and grammar expressions of one chat back to disk.

    The lists are stored as UTF-8 JSON (non-ASCII characters kept as-is,
    2-space indentation) in
    ``data/expression/learnt_style/<chat_id>/expressions.json`` and
    ``data/expression/learnt_grammar/<chat_id>/expressions.json``. Existing
    files are overwritten.

    Args:
        chat_id: Identifier of the chat; converted with ``str()`` to build the
            directory name.
        style_exprs: Expressions written to the ``learnt_style`` file.
        grammar_exprs: Expressions written to the ``learnt_grammar`` file.
    """
    targets = [
        (os.path.join("data", "expression", category, str(chat_id), "expressions.json"), payload)
        for category, payload in (
            ("learnt_style", style_exprs),
            ("learnt_grammar", grammar_exprs),
        )
    ]

    # Make sure both destination directories exist before writing anything.
    for path, _ in targets:
        os.makedirs(os.path.dirname(path), exist_ok=True)

    for path, payload in targets:
        with open(path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
|
|
|
|
|
|
def cleanup_expressions(expressions: List[Dict]) -> List[Dict]:
    """Prune a list of expression entries.

    Two steps are applied:

    1. Entries whose ``"count"`` (default 0 when absent) is not strictly
       greater than ``MIN_COUNT_THRESHOLD`` are dropped.
    2. If more than ``MAX_EXPRESSION_COUNT`` entries remain, the
       ``MAX_EXPRESSION_COUNT // 2`` entries with the highest count are always
       kept, and the rest of the quota is filled with a random pick from the
       remaining entries.

    Args:
        expressions: Expression dicts; a falsy value (empty list, ``None``)
            yields an empty list.

    Returns:
        A new list with the surviving entries. The input list is not modified.
    """
    if not expressions:
        return []

    # Step 1: discard entries that are used too rarely.
    survivors = [item for item in expressions if item.get("count", 0) > MIN_COUNT_THRESHOLD]

    # Within the limit: nothing more to trim.
    if len(survivors) <= MAX_EXPRESSION_COUNT:
        return survivors

    # Step 2: rank by usage count, most used first.
    ranked = sorted(survivors, key=lambda item: item.get("count", 0), reverse=True)

    # The top half of the quota is reserved for the most frequent entries.
    guaranteed = MAX_EXPRESSION_COUNT // 2
    kept = ranked[:guaranteed]

    # The rest of the quota is drawn at random from the lower-ranked entries.
    tail = ranked[guaranteed:]
    random.shuffle(tail)
    kept.extend(tail[: MAX_EXPRESSION_COUNT - guaranteed])

    return kept
|
|
|
|
|
|
def main():
    """Clean the stored expression data of every chat and print a summary.

    Chats are discovered as sub-directories of
    ``data/expression/learnt_style`` (relative to the current working
    directory); a chat that only has a ``learnt_grammar`` directory is not
    discovered. For each chat the style and grammar expressions are loaded,
    pruned with ``cleanup_expressions`` and written back, and the before/after
    counts are printed per chat and in total.
    """
    style_root = os.path.join("data", "expression", "learnt_style")
    # Only directories are chats. A stray regular file matched by the glob
    # would otherwise be treated as a chat id and make save_expressions fail
    # in os.makedirs (the path exists but is not a directory).
    chat_ids = [
        os.path.basename(entry)
        for entry in glob.glob(os.path.join(style_root, "*"))
        if os.path.isdir(entry)
    ]

    if not chat_ids:
        print("没有找到任何群聊的表达方式数据")
        return

    print(f"开始清理 {len(chat_ids)} 个群聊的表达方式数据...")

    # Running totals across all chats, before and after the cleanup.
    total_style_before = 0
    total_style_after = 0
    total_grammar_before = 0
    total_grammar_after = 0

    for chat_id in chat_ids:
        print(f"\n处理群聊 {chat_id}:")

        style_exprs, grammar_exprs = load_expressions(chat_id)

        # Sizes before pruning.
        style_count_before = len(style_exprs)
        grammar_count_before = len(grammar_exprs)
        total_style_before += style_count_before
        total_grammar_before += grammar_count_before

        style_exprs = cleanup_expressions(style_exprs)
        grammar_exprs = cleanup_expressions(grammar_exprs)

        # Sizes after pruning.
        style_count_after = len(style_exprs)
        grammar_count_after = len(grammar_exprs)
        total_style_after += style_count_after
        total_grammar_after += grammar_count_after

        # Persist the pruned lists (overwrites the existing files).
        save_expressions(chat_id, style_exprs, grammar_exprs)

        print(f"语言风格: {style_count_before} -> {style_count_after}")
        print(f"句法特点: {grammar_count_before} -> {grammar_count_after}")

    print("\n清理完成!")
    print(f"语言风格总数: {total_style_before} -> {total_style_after}")
    print(f"句法特点总数: {total_grammar_before} -> {total_grammar_after}")
    print(
        f"总共清理了 {total_style_before + total_grammar_before - total_style_after - total_grammar_after} 条表达方式"
    )


if __name__ == "__main__":
    main()
|