feat: 提升语义兴趣评分与拼写错误生成

- 为中文拼写错误生成器实现了后台预热功能，以提升首次使用时的性能。
- 更新了 MessageStorageBatcher，以支持可配置的提交批次大小和间隔，优化数据库写入性能。
- 增强了数据集生成器，对样本规模设置硬性限制并提升采样效率。
- 将 AutoTrainer 中的最大样本数增加至 1000，以优化训练数据利用率。
- 对亲和兴趣计算器进行了重构，以避免并发初始化并优化模型加载逻辑。
- 引入批量处理机制用于语义兴趣评分，以应对高频聊天场景。
- 更新了配置模板以反映新的评分参数，并移除了已弃用的兴趣阈值。
2025-12-12 14:11:36 +08:00
parent 9d01b81cef
commit e6a4f855a2
17 changed files with 433 additions and 554 deletions
--- a/src/chat/semantic_interest/trainer.py
+++ b/src/chat/semantic_interest/trainer.py
@@ -191,44 +191,3 @@ class SemanticInterestTrainer:

        return dataset_path, model_path, metrics

-
-async def main():
-    """示例：训练一个语义兴趣度模型"""
-
-    # 示例人格信息
-    persona_info = {
-        "name": "小狐",
-        "interests": ["动漫", "游戏", "编程", "技术", "二次元"],
-        "dislikes": ["广告", "政治", "无聊闲聊"],
-        "personality": "活泼开朗，对新鲜事物充满好奇",
-    }
-
-    # 创建训练器
-    trainer = SemanticInterestTrainer()
-
-    # 执行完整训练流程
-    dataset_path, model_path, metrics = await trainer.full_training_pipeline(
-        persona_info=persona_info,
-        days=7,  # 使用最近 7 天的消息
-        max_samples=500,  # 采样 500 条消息
-        llm_model_name=None,  # 使用默认 LLM
-        tfidf_config={
-            "analyzer": "char",
-            "ngram_range": (2, 4),
-            "max_features": 15000,
-            "min_df": 3,
-        },
-        model_config={
-            "class_weight": "balanced",
-            "max_iter": 1000,
-        },
-    )
-
-    print(f"\n训练完成！")
-    print(f"数据集: {dataset_path}")
-    print(f"模型: {model_path}")
-    print(f"准确率: {metrics.get('test_accuracy', 0):.4f}")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())