feat(reply): 引入统一格式过滤器,优化回复内容清理逻辑

This commit is contained in:
Windpicker-owo
2025-11-10 14:12:11 +08:00
parent 259a744a3b
commit b427960441
3 changed files with 79 additions and 69 deletions

View File

@@ -1812,8 +1812,9 @@ class DefaultReplyer:
# 移除 [SPLIT] 标记,防止消息被分割
content = content.replace("[SPLIT]", "")
# 应用增强的格式过滤器
content = self._apply_system_format_filter(content)
# 应用统一的格式过滤器
from src.chat.utils.utils import filter_system_format_content
content = filter_system_format_content(content)
logger.debug(f"replyer生成内容: {content}")
return content, reasoning_content, model_name, tool_calls
@@ -2100,72 +2101,7 @@ class DefaultReplyer:
except Exception as e:
logger.error(f"存储聊天记忆失败: {e}")
def _apply_system_format_filter(self, content: str) -> str:
    """
    Strip system-generated formatting markers from generated reply text.

    The following marker shapes are removed:
        1. Reply prefixes ``[回复xxx],说:`` anywhere in the text.
        2. A leading ``[回复...]`` tag (legacy format, nested brackets allowed).
        3. Sticker markers ``[表情包:xxx]``.
        4. Image markers ``[图片:xxx]``.
        5. Mention markers ``@<xxx>``.
        6. Failure placeholders ``[表情包(...)]`` and ``[图片(...)]``.

    Afterwards every run of whitespace is collapsed into a single space and
    the result is stripped.

    Args:
        content: Raw reply text.

    Returns:
        The cleaned text. A falsy ``content`` is returned unchanged.
    """
    import re

    if not content:
        return content

    original_content = content
    cleaned_content = content.strip()

    # 1. Reply prefixes of the form "[回复xxx],说:".
    cleaned_content = re.sub(r"\[回复[^\]]*\],说:\s*", "", cleaned_content)
    # The "<name:id>" variant is matched separately so that a "]" inside the
    # angle brackets does not defeat the generic pattern above.
    cleaned_content = re.sub(r"\[回复<[^>]*>\],说:\s*", "", cleaned_content)

    # 2. Legacy leading "[回复...]" tag. This must run after step 1.
    #    Only the leading tag is removed: we look for the bracket that closes
    #    it instead of the last "]" in the text, otherwise a trailing marker
    #    such as "[图片:x]" would make the whole reply disappear.
    if cleaned_content.startswith("[回复"):
        depth = 0
        tag_end = -1
        for index, char in enumerate(cleaned_content):
            if char == "[":
                depth += 1
            elif char == "]":
                depth -= 1
                if depth == 0:
                    tag_end = index
                    break
        if tag_end == -1:
            # Unbalanced brackets: keep the historical behaviour and cut
            # after the last closing bracket, if there is one.
            tag_end = cleaned_content.rfind("]")
        if tag_end != -1:
            cleaned_content = cleaned_content[tag_end + 1 :].strip()

    # 3. Sticker markers: [表情包:xxx]
    cleaned_content = re.sub(r"\[表情包:[^\]]*\]", "", cleaned_content)

    # 4. Image markers: [图片:xxx]
    cleaned_content = re.sub(r"\[图片:[^\]]*\]", "", cleaned_content)

    # 5. Mention markers: @<xxx>
    cleaned_content = re.sub(r"@<[^>]*>", "", cleaned_content)

    # 6. Failure placeholders such as [表情包(描述生成失败)] / [图片(描述生成失败)]
    cleaned_content = re.sub(r"\[表情包\([^)]*\)\]", "", cleaned_content)
    cleaned_content = re.sub(r"\[图片\([^)]*\)\]", "", cleaned_content)

    # Collapse the whitespace left behind by the removals.
    cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip()

    # Log only when a marker was actually removed. Whitespace normalisation
    # alone is not a system-format hit, so compare against the normalised
    # original rather than the raw one.
    normalized_original = re.sub(r"\s+", " ", original_content).strip()
    if cleaned_content != normalized_original:
        logger.info(
            "[格式过滤器] 检测到并清理了系统格式化文本。"
            f"原始内容: '{original_content}', "
            f"清理后: '{cleaned_content}'"
        )

    return cleaned_content
def weighted_sample_no_replacement(items, weights, k) -> list:
"""

View File

@@ -923,3 +923,69 @@ def assign_message_ids_flexible(
# # 增强版本 - 使用时间戳
# result3 = assign_message_ids_flexible(messages, prefix="ts", use_timestamp=True)
# # 结果: [{'id': 'ts123a1b', 'message': 'Hello'}, {'id': 'ts123c2d', 'message': 'World'}, {'id': 'ts123e3f', 'message': 'Test message'}]
def filter_system_format_content(content: str | None) -> str:
"""
过滤系统格式化内容,移除回复、@、图片、表情包等系统生成的格式文本
此方法过滤以下类型的系统格式化内容:
1. 回复格式:[回复xxx]xxx
2. 表情包格式:[表情包xxx]
3. 图片格式:[图片:xxx]
4. @格式:@<xxx>
5. 错误格式:[表情包(...)]、[图片(...)]
6. [回复开头的格式
Args:
content: 原始内容
Returns:
过滤后的纯文本内容
"""
if not content:
return ""
original_content = content
cleaned_content = content.strip()
# 1. 移除回复格式:[回复xxx]xxx各种变体
# 匹配所有包含"],说:"格式的回复
cleaned_content = re.sub(r"\[回复[^\]]*\],说:\s*", "", cleaned_content)
# 匹配 [回复<xxx:数字>]xxx 格式
cleaned_content = re.sub(r"\[回复<[^>]*>\],说:\s*", "", cleaned_content)
# 2. 处理原有的[回复开头格式(保持向后兼容)
# 注意:这步要在上面处理完成后再执行,避免冲突
if cleaned_content.startswith("[回复"):
last_bracket_index = cleaned_content.rfind("]")
if last_bracket_index != -1:
cleaned_content = cleaned_content[last_bracket_index + 1 :].strip()
# 3. 移除表情包格式:[表情包xxx]
cleaned_content = re.sub(r"\[表情包:[^\]]*\]", "", cleaned_content)
# 4. 移除图片格式:[图片:xxx]
cleaned_content = re.sub(r"\[图片:[^\]]*\]", "", cleaned_content)
# 5. 移除@格式:@<xxx>
cleaned_content = re.sub(r"@<[^>]*>", "", cleaned_content)
# 6. 移除其他可能的系统格式
# [表情包(描述生成失败)] 等错误格式
cleaned_content = re.sub(r"\[表情包\([^)]*\)\]", "", cleaned_content)
# [图片(描述生成失败)] 等错误格式
cleaned_content = re.sub(r"\[图片\([^)]*\)\]", "", cleaned_content)
# 清理多余空格
cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip()
# 记录过滤操作
if cleaned_content != original_content.strip():
logger.info(
f"[系统格式过滤器] 检测到并清理了系统格式化文本。"
f"原始内容: '{original_content}', "
f"清理后: '{cleaned_content}'"
)
return cleaned_content

View File

@@ -412,7 +412,15 @@ class ProactiveThinkingPlanner:
return None
logger.info(f"生成回复成功: {response[:50]}...")
return response.strip()
# 应用格式过滤器,确保回复内容不包含系统格式化文本
from src.chat.utils.utils import filter_system_format_content
filtered_response = filter_system_format_content(response.strip())
if filtered_response != response.strip():
logger.debug(f"主动思考回复已过滤系统格式: '{response.strip()}' -> '{filtered_response}'")
return filtered_response
except Exception as e:
logger.error(f"生成回复失败: {e}", exc_info=True)