Mofox-Core/scripts/cleanup_expressions.py

import os
import json
import random
from typing import List, Dict, Tuple
import glob

MAX_EXPRESSION_COUNT = 300  # 每个群最多保留的表达方式数量
MIN_COUNT_THRESHOLD = 0.01  # 最小使用次数阈值


def load_expressions(chat_id: str) -> Tuple[List[Dict], List[Dict]]:
    """加载指定群聊的表达方式"""
    style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")
    grammar_file = os.path.join("data", "expression", "learnt_grammar", str(chat_id), "expressions.json")

    style_exprs = []
    grammar_exprs = []

    if os.path.exists(style_file):
        with open(style_file, "r", encoding="utf-8") as f:
            style_exprs = json.load(f)

    if os.path.exists(grammar_file):
        with open(grammar_file, "r", encoding="utf-8") as f:
            grammar_exprs = json.load(f)

    return style_exprs, grammar_exprs


def save_expressions(chat_id: str, style_exprs: List[Dict], grammar_exprs: List[Dict]) -> None:
    """保存表达方式到文件"""
    style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")
    grammar_file = os.path.join("data", "expression", "learnt_grammar", str(chat_id), "expressions.json")

    os.makedirs(os.path.dirname(style_file), exist_ok=True)
    os.makedirs(os.path.dirname(grammar_file), exist_ok=True)

    with open(style_file, "w", encoding="utf-8") as f:
        json.dump(style_exprs, f, ensure_ascii=False, indent=2)

    with open(grammar_file, "w", encoding="utf-8") as f:
        json.dump(grammar_exprs, f, ensure_ascii=False, indent=2)


def cleanup_expressions(expressions: List[Dict]) -> List[Dict]:
    """清理表达方式列表"""
    if not expressions:
        return []

    # 1. 移除使用次数过低的表达方式
    expressions = [expr for expr in expressions if expr.get("count", 0) > MIN_COUNT_THRESHOLD]

    # 2. 如果数量超过限制，随机删除多余的
    if len(expressions) > MAX_EXPRESSION_COUNT:
        # 按使用次数排序
        expressions.sort(key=lambda x: x.get("count", 0), reverse=True)

        # 保留前50%的高频表达方式
        keep_count = MAX_EXPRESSION_COUNT // 2
        keep_exprs = expressions[:keep_count]

        # 从剩余的表达方式中随机选择
        remaining_exprs = expressions[keep_count:]
        random.shuffle(remaining_exprs)
        keep_exprs.extend(remaining_exprs[: MAX_EXPRESSION_COUNT - keep_count])

        expressions = keep_exprs

    return expressions


def main():
    # 获取所有群聊ID
    style_dirs = glob.glob(os.path.join("data", "expression", "learnt_style", "*"))
    chat_ids = [os.path.basename(d) for d in style_dirs]

    if not chat_ids:
        print("没有找到任何群聊的表达方式数据")
        return

    print(f"开始清理 {len(chat_ids)} 个群聊的表达方式数据...")

    total_style_before = 0
    total_style_after = 0
    total_grammar_before = 0
    total_grammar_after = 0

    for chat_id in chat_ids:
        print(f"\n处理群聊 {chat_id}:")

        # 加载表达方式
        style_exprs, grammar_exprs = load_expressions(chat_id)

        # 记录清理前的数量
        style_count_before = len(style_exprs)
        grammar_count_before = len(grammar_exprs)
        total_style_before += style_count_before
        total_grammar_before += grammar_count_before

        # 清理表达方式
        style_exprs = cleanup_expressions(style_exprs)
        grammar_exprs = cleanup_expressions(grammar_exprs)

        # 记录清理后的数量
        style_count_after = len(style_exprs)
        grammar_count_after = len(grammar_exprs)
        total_style_after += style_count_after
        total_grammar_after += grammar_count_after

        # 保存清理后的表达方式
        save_expressions(chat_id, style_exprs, grammar_exprs)

        print(f"语言风格: {style_count_before} -> {style_count_after}")
        print(f"句法特点: {grammar_count_before} -> {grammar_count_after}")

    print("\n清理完成！")
    print(f"语言风格总数: {total_style_before} -> {total_style_after}")
    print(f"句法特点总数: {total_grammar_before} -> {total_grammar_after}")
    print(
        f"总共清理了 {total_style_before + total_grammar_before - total_style_after - total_grammar_after} 条表达方式"
    )


if __name__ == "__main__":
    main()