feat(expression): 增强表达学习与选择系统的健壮性和智能匹配

- 改进表达学习器的提示词格式规范，增强LLM输出解析的容错性 - 优化表达选择器的模型预测模式，添加情境提取和模糊匹配机制 - 增强StyleLearner的错误处理和日志记录，提高训练和预测的稳定性 - 改进流循环管理器的日志输出，避免重复信息刷屏 - 扩展SendAPI的消息查找功能，支持DatabaseMessages对象兼容 - 添加智能回退机制，当模型预测失败时自动切换到经典模式 - 优化数据库查询逻辑，支持跨聊天流的表达方式共享 BREAKING CHANGE: 表达选择器的模型预测模式现在需要情境提取器配合使用，旧版本配置可能需要更新依赖关系
2025-10-30 11:16:30 +08:00
parent 21167a2685
commit 9372f6d31c
9 changed files with 816 additions and 82 deletions
--- a/src/chat/express/situation_extractor.py
+++ b/src/chat/express/situation_extractor.py
@@ -0,0 +1,162 @@
+"""
+情境提取器
+从聊天历史中提取当前的情境（situation），用于 StyleLearner 预测
+"""
+from typing import Optional
+
+from src.chat.utils.prompt import Prompt, global_prompt_manager
+from src.common.logger import get_logger
+from src.config.config import global_config, model_config
+from src.llm_models.utils_model import LLMRequest
+
+logger = get_logger("situation_extractor")
+
+
+def init_prompt():
+    situation_extraction_prompt = """
+以下是正在进行的聊天内容：
+{chat_history}
+
+你的名字是{bot_name}{target_message_info}
+
+请分析当前聊天的情境特征，提取出最能描述当前情境的1-3个关键场景描述。
+
+场景描述应该：
+1. 简洁明了（每个不超过20个字）
+2. 聚焦情绪、话题、氛围
+3. 不涉及具体人名
+4. 类似于"表示惊讶"、"讨论游戏"、"表达赞同"这样的格式
+
+请以纯文本格式输出，每行一个场景描述，不要有序号、引号或其他格式：
+
+例如：
+表示惊讶和意外
+讨论技术问题
+表达友好的赞同
+
+现在请提取当前聊天的情境：
+"""
+    Prompt(situation_extraction_prompt, "situation_extraction_prompt")
+
+
+class SituationExtractor:
+    """情境提取器，从聊天历史中提取当前情境"""
+    
+    def __init__(self):
+        self.llm_model = LLMRequest(
+            model_set=model_config.model_task_config.utils_small,
+            request_type="expression.situation_extractor"
+        )
+    
+    async def extract_situations(
+        self,
+        chat_history: list | str,
+        target_message: Optional[str] = None,
+        max_situations: int = 3
+    ) -> list[str]:
+        """
+        从聊天历史中提取情境
+        
+        Args:
+            chat_history: 聊天历史（列表或字符串）
+            target_message: 目标消息（可选）
+            max_situations: 最多提取的情境数量
+            
+        Returns:
+            情境描述列表
+        """
+        # 转换chat_history为字符串
+        if isinstance(chat_history, list):
+            chat_info = "\n".join([
+                f"{msg.get('sender', 'Unknown')}: {msg.get('content', '')}" 
+                for msg in chat_history
+            ])
+        else:
+            chat_info = chat_history
+        
+        # 构建目标消息信息
+        if target_message:
+            target_message_info = f"，现在你想要回复消息：{target_message}"
+        else:
+            target_message_info = ""
+        
+        # 构建 prompt
+        try:
+            prompt = (await global_prompt_manager.get_prompt_async("situation_extraction_prompt")).format(
+                bot_name=global_config.bot.nickname,
+                chat_history=chat_info,
+                target_message_info=target_message_info
+            )
+            
+            # 调用 LLM
+            response, _ = await self.llm_model.generate_response_async(
+                prompt=prompt,
+                temperature=0.3
+            )
+            
+            if not response or not response.strip():
+                logger.warning("LLM返回空响应，无法提取情境")
+                return []
+            
+            # 解析响应
+            situations = self._parse_situations(response, max_situations)
+            
+            if situations:
+                logger.debug(f"提取到 {len(situations)} 个情境: {situations}")
+            else:
+                logger.warning(f"无法从LLM响应中解析出情境。响应:\n{response}")
+            
+            return situations
+            
+        except Exception as e:
+            logger.error(f"提取情境失败: {e}")
+            return []
+    
+    @staticmethod
+    def _parse_situations(response: str, max_situations: int) -> list[str]:
+        """
+        解析 LLM 返回的情境描述
+        
+        Args:
+            response: LLM 响应
+            max_situations: 最多返回的情境数量
+            
+        Returns:
+            情境描述列表
+        """
+        situations = []
+        
+        for line in response.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            
+            # 移除可能的序号、引号等
+            line = line.lstrip('0123456789.、-*>）)】] \t"\'""''')
+            line = line.rstrip('"\'""''')
+            line = line.strip()
+            
+            if not line:
+                continue
+            
+            # 过滤掉明显不是情境描述的内容
+            if len(line) > 30:  # 太长
+                continue
+            if len(line) < 2:   # 太短
+                continue
+            if any(keyword in line.lower() for keyword in ['例如', '注意', '请', '分析', '总结']):
+                continue
+            
+            situations.append(line)
+            
+            if len(situations) >= max_situations:
+                break
+        
+        return situations
+
+
+# 初始化 prompt
+init_prompt()
+
+# 全局单例
+situation_extractor = SituationExtractor()