feat: 优化语义兴趣评分与拼写错误生成

- 为中文拼写错误生成器实现了后台预热功能，以提升首次使用时的性能。
- 更新了 MessageStorageBatcher，支持可配置的提交批次大小和间隔，优化数据库写入性能。
- 增强了数据集生成器，对样本规模设置硬性限制并提升采样效率。
- 将 AutoTrainer 中的最大样本数增加至 1000，以优化训练数据利用率。
- 对亲和兴趣计算器进行了重构，以避免并发初始化并优化模型加载逻辑。
- 引入批量处理机制用于语义兴趣评分，以应对高频聊天场景。
- 更新了配置模板以反映新的评分参数，并移除了已弃用的兴趣阈值。
2025-12-12 14:11:36 +08:00
parent 9d01b81cef
commit e6a4f855a2
17 changed files with 433 additions and 554 deletions
--- a/src/common/logger.py
+++ b/src/common/logger.py
@@ -1,6 +1,7 @@
 # 使用基于时间戳的文件处理器，简单的轮转份数限制

 import logging
+import os
 import tarfile
 import threading
 import time
@@ -189,6 +190,10 @@ class TimestampedFileHandler(logging.Handler):
        self.backup_count = backup_count
        self.encoding = encoding
        self._lock = threading.Lock()
+        self._current_size = 0
+        self._bytes_since_check = 0
+        self._newline_bytes = len(os.linesep.encode(self.encoding or "utf-8"))
+        self._stat_refresh_threshold = max(self.max_bytes // 8, 256 * 1024)

        # 当前活跃的日志文件
        self.current_file = None
@@ -207,11 +212,29 @@ class TimestampedFileHandler(logging.Handler):
            # 极低概率碰撞，稍作等待
            time.sleep(0.001)
        self.current_stream = open(self.current_file, "a", encoding=self.encoding)
+        self._current_size = self.current_file.stat().st_size if self.current_file.exists() else 0
+        self._bytes_since_check = 0

-    def _should_rollover(self):
-        """检查是否需要轮转"""
-        if self.current_file and self.current_file.exists():
-            return self.current_file.stat().st_size >= self.max_bytes
+    def _should_rollover(self, incoming_size: int = 0) -> bool:
+        """检查是否需要轮转，使用内存缓存的大小信息减少磁盘stat次数。"""
+        if not self.current_file:
+            return False
+
+        projected = self._current_size + incoming_size
+        if projected >= self.max_bytes:
+            return True
+
+        self._bytes_since_check += incoming_size
+        if self._bytes_since_check >= self._stat_refresh_threshold:
+            try:
+                if self.current_file.exists():
+                    self._current_size = self.current_file.stat().st_size
+                else:
+                    self._current_size = 0
+            except OSError:
+                self._current_size = 0
+            finally:
+                self._bytes_since_check = 0
        return False

    def _do_rollover(self):
@@ -270,16 +293,17 @@ class TimestampedFileHandler(logging.Handler):
    def emit(self, record):
        """发出日志记录"""
        try:
+            message = self.format(record)
+            encoded_len = len(message.encode(self.encoding or "utf-8")) + self._newline_bytes
+
            with self._lock:
-                # 检查是否需要轮转
-                if self._should_rollover():
+                if self._should_rollover(encoded_len):
                    self._do_rollover()

-                # 写入日志
                if self.current_stream:
-                    msg = self.format(record)
-                    self.current_stream.write(msg + "\n")
+                    self.current_stream.write(message + "\n")
                    self.current_stream.flush()
+                    self._current_size += encoded_len

        except Exception:
            self.handleError(record)
@@ -837,10 +861,6 @@ DEFAULT_MODULE_ALIASES = {
 }


-# 创建全局 Rich Console 实例用于颜色渲染
-_rich_console = Console(force_terminal=True, color_system="truecolor")
-
-
 class ModuleColoredConsoleRenderer:
    """自定义控制台渲染器，使用 Rich 库原生支持 hex 颜色"""

@@ -848,6 +868,7 @@ class ModuleColoredConsoleRenderer:
        # sourcery skip: merge-duplicate-blocks, remove-redundant-if
        self._colors = colors
        self._config = LOG_CONFIG
+        self._render_console = Console(force_terminal=True, color_system="truecolor", width=999)

        # 日志级别颜色 (#RRGGBB 格式)
        self._level_colors_hex = {
@@ -876,6 +897,22 @@ class ModuleColoredConsoleRenderer:
            self._enable_level_colors = False
            self._enable_full_content_colors = False

+    @staticmethod
+    def _looks_like_markup(content: str) -> bool:
+        """快速判断内容里是否包含 Rich 标记，避免不必要的解析开销。"""
+        if not content:
+            return False
+        return "[" in content and "]" in content
+
+    def _render_content_text(self, content: str, *, style: str | None = None) -> Text:
+        """只在必要时解析 Rich 标记，降低CPU占用。"""
+        if self._looks_like_markup(content):
+            try:
+                return Text.from_markup(content, style=style)
+            except Exception:
+                return Text(content, style=style)
+        return Text(content, style=style)
+
    def __call__(self, logger, method_name, event_dict):
        # sourcery skip: merge-duplicate-blocks
        """渲染日志消息"""
@@ -966,9 +1003,9 @@ class ModuleColoredConsoleRenderer:
                if prefix:
                    # 解析 prefix 中的 Rich 标记
                    if module_hex_color:
-                        content_text.append(Text.from_markup(prefix, style=module_hex_color))
+                        content_text.append(self._render_content_text(prefix, style=module_hex_color))
                    else:
-                        content_text.append(Text.from_markup(prefix))
+                        content_text.append(self._render_content_text(prefix))

                # 与"内心思考"段落之间插入空行
                if prefix:
@@ -983,24 +1020,12 @@ class ModuleColoredConsoleRenderer:
            else:
                # 使用 Text.from_markup 解析 Rich 标记语言
                if module_hex_color:
-                    try:
-                        parts.append(Text.from_markup(event_content, style=module_hex_color))
-                    except Exception:
-                        # 如果标记解析失败，回退到普通文本
-                        parts.append(Text(event_content, style=module_hex_color))
+                    parts.append(self._render_content_text(event_content, style=module_hex_color))
                else:
-                    try:
-                        parts.append(Text.from_markup(event_content))
-                    except Exception:
-                        # 如果标记解析失败，回退到普通文本
-                        parts.append(Text(event_content))
+                    parts.append(self._render_content_text(event_content))
        else:
            # 即使在非 full 模式下，也尝试解析 Rich 标记（但不应用颜色）
-            try:
-                parts.append(Text.from_markup(event_content))
-            except Exception:
-                # 如果标记解析失败，使用普通文本
-                parts.append(Text(event_content))
+            parts.append(self._render_content_text(event_content))

        # 处理其他字段
        extras = []
@@ -1029,12 +1054,10 @@ class ModuleColoredConsoleRenderer:

        # 使用 Rich 拼接并返回字符串
        result = Text(" ").join(parts)
-        # 将 Rich Text 对象转换为带 ANSI 颜色码的字符串
-        from io import StringIO
-        string_io = StringIO()
-        temp_console = Console(file=string_io, force_terminal=True, color_system="truecolor", width=999)
-        temp_console.print(result, end="")
-        return string_io.getvalue()
+        # 使用持久化 Console + capture 避免每条日志重复实例化
+        with self._render_console.capture() as capture:
+            self._render_console.print(result, end="")
+        return capture.get()


 # 配置标准logging以支持文件输出和压缩