feat: 提升语义兴趣评分与拼写错误生成
- 为中文拼写错误生成器实现了后台预热功能，以提升首次使用时的性能。
- 更新了 MessageStorageBatcher 以支持可配置的提交批次大小和间隔，优化数据库写入性能。
- 增强了数据集生成器，对样本规模设置硬性限制并提升采样效率。
- 将 AutoTrainer 中的最大样本数增加至 1000，以优化训练数据利用率。
- 对亲和兴趣计算器进行了重构，以避免并发初始化并优化模型加载逻辑。
- 引入批量处理机制用于语义兴趣评分，以应对高频聊天场景。
- 更新了配置模板以反映新的评分参数，并移除了已弃用的兴趣阈值。
This commit is contained in:
@@ -96,7 +96,7 @@ class ChineseTypoGenerator:
|
||||
|
||||
# 🔧 内存优化:复用全局缓存的拼音字典和字频数据
|
||||
if _shared_pinyin_dict is None:
|
||||
_shared_pinyin_dict = self._create_pinyin_dict()
|
||||
_shared_pinyin_dict = self._load_or_create_pinyin_dict()
|
||||
logger.debug("拼音字典已创建并缓存")
|
||||
self.pinyin_dict = _shared_pinyin_dict
|
||||
|
||||
@@ -141,6 +141,35 @@ class ChineseTypoGenerator:
|
||||
|
||||
return normalized_freq
|
||||
|
||||
def _load_or_create_pinyin_dict(self):
    """Load the pinyin-to-characters mapping from disk, or build and cache it.

    The cache lives at ``depends-data/pinyin_dict.json``; the path is
    relative, so it is resolved against the current working directory.
    When the file exists and parses, its contents are returned as a
    ``defaultdict(list)``. Otherwise the mapping is produced by
    ``self._create_pinyin_dict()`` and written back to the cache on a
    best-effort basis: a failed read or write is only logged as a warning.

    Returns:
        The mapping restored from the cache as a ``defaultdict(list)``, or
        the object returned by ``self._create_pinyin_dict()``.
    """
    cache_path = Path("depends-data/pinyin_dict.json")

    if cache_path.exists():
        try:
            cached = orjson.loads(cache_path.read_text(encoding="utf-8"))
            # Rebuild as defaultdict(list) so existing callers keep the
            # container type they relied on before the cache was added.
            return defaultdict(
                list,
                {pinyin: list(chars) for pinyin, chars in cached.items()},
            )
        except Exception as e:
            # Unreadable or malformed cache: fall through and regenerate.
            logger.warning(f"读取拼音缓存失败,将重新生成: {e}")

    fresh_dict = self._create_pinyin_dict()

    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with cache_path.open("w", encoding="utf-8") as cache_fh:
            # orjson emits bytes; decode because the file is opened in text mode.
            serialized = orjson.dumps(dict(fresh_dict), option=orjson.OPT_INDENT_2)
            cache_fh.write(serialized.decode("utf-8"))
    except Exception as e:
        # The cache is only an optimisation; the in-memory result is still valid.
        logger.warning(f"写入拼音缓存失败(不影响使用): {e}")

    return fresh_dict
|
||||
|
||||
@staticmethod
|
||||
def _create_pinyin_dict():
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user