diff --git a/src/chat/replyer/default_generator.py b/src/chat/replyer/default_generator.py index e389a0528..7818ed927 100644 --- a/src/chat/replyer/default_generator.py +++ b/src/chat/replyer/default_generator.py @@ -1811,24 +1811,9 @@ class DefaultReplyer: if content: # 移除 [SPLIT] 标记,防止消息被分割 content = content.replace("[SPLIT]", "") - - original_content_for_filter = content - cleaned_content = content.strip() - - # 终极过滤器:处理模型模仿的、可能存在多层嵌套的回复格式 - # 规则:如果消息以 "[回复" 开头,则删除从开头到最后一个 "]" 的所有内容。 - if cleaned_content.startswith("[回复"): - last_bracket_index = cleaned_content.rfind("]") - if last_bracket_index != -1: - cleaned_content = cleaned_content[last_bracket_index + 1 :].strip() - - if cleaned_content != original_content_for_filter.strip(): - logger.warning( - "检测到并清理了模型生成的不规范回复格式。" - f"原始内容: '{original_content_for_filter}', 清理后: '{cleaned_content}'" - ) - content = cleaned_content + # 应用增强的格式过滤器 + content = self._apply_system_format_filter(content) logger.debug(f"replyer生成内容: {content}") return content, reasoning_content, model_name, tool_calls @@ -2115,6 +2100,72 @@ class DefaultReplyer: except Exception as e: logger.error(f"存储聊天记忆失败: {e}") + def _apply_system_format_filter(self, content: str) -> str: + """ + 应用增强的系统格式过滤器,移除各种系统格式化文本 + + 此方法过滤以下类型的系统格式化内容: + 1. 回复格式:[回复xxx],说:xxx + 2. 表情包格式:[表情包:xxx] + 3. 图片格式:[图片:xxx] + 4. @格式:@ + 5. 原有的[回复开头格式 + + Args: + content: 原始内容 + + Returns: + 过滤后的内容 + """ + import re + + if not content: + return content + + original_content = content + cleaned_content = content.strip() + + # 1. 移除回复格式:[回复xxx],说:xxx(各种变体) + # 匹配所有包含"],说:"格式的回复 + cleaned_content = re.sub(r"\[回复[^\]]*\],说:\s*", "", cleaned_content) + # 匹配 [回复],说:xxx 格式 + cleaned_content = re.sub(r"\[回复<[^>]*>\],说:\s*", "", cleaned_content) + + # 2. 处理原有的[回复开头格式(保持向后兼容) + # 注意:这步要在上面处理完成后再执行,避免冲突 + if cleaned_content.startswith("[回复"): + last_bracket_index = cleaned_content.rfind("]") + if last_bracket_index != -1: + cleaned_content = cleaned_content[last_bracket_index + 1 :].strip() + + # 3. 移除表情包格式:[表情包:xxx] + cleaned_content = re.sub(r"\[表情包:[^\]]*\]", "", cleaned_content) + + # 4. 移除图片格式:[图片:xxx] + cleaned_content = re.sub(r"\[图片:[^\]]*\]", "", cleaned_content) + + # 5. 移除@格式:@ + cleaned_content = re.sub(r"@<[^>]*>", "", cleaned_content) + + # 6. 移除其他可能的系统格式 + # [表情包(描述生成失败)] 等错误格式 + cleaned_content = re.sub(r"\[表情包\([^)]*\)\]", "", cleaned_content) + # [图片(描述生成失败)] 等错误格式 + cleaned_content = re.sub(r"\[图片\([^)]*\)\]", "", cleaned_content) + + # 清理多余空格 + cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip() + + # 记录过滤操作 + if cleaned_content != original_content.strip(): + logger.info( + f"[格式过滤器] 检测到并清理了系统格式化文本。" + f"原始内容: '{original_content}', " + f"清理后: '{cleaned_content}'" + ) + + return cleaned_content + def weighted_sample_no_replacement(items, weights, k) -> list: """