feat(reply): 引入增强的格式过滤器,优化回复内容清理逻辑
This commit is contained in:
@@ -1812,23 +1812,8 @@ class DefaultReplyer:
|
|||||||
# 移除 [SPLIT] 标记,防止消息被分割
|
# 移除 [SPLIT] 标记,防止消息被分割
|
||||||
content = content.replace("[SPLIT]", "")
|
content = content.replace("[SPLIT]", "")
|
||||||
|
|
||||||
|
# 应用增强的格式过滤器
|
||||||
original_content_for_filter = content
|
content = self._apply_system_format_filter(content)
|
||||||
cleaned_content = content.strip()
|
|
||||||
|
|
||||||
# 终极过滤器:处理模型模仿的、可能存在多层嵌套的回复格式
|
|
||||||
# 规则:如果消息以 "[回复" 开头,则删除从开头到最后一个 "]" 的所有内容。
|
|
||||||
if cleaned_content.startswith("[回复"):
|
|
||||||
last_bracket_index = cleaned_content.rfind("]")
|
|
||||||
if last_bracket_index != -1:
|
|
||||||
cleaned_content = cleaned_content[last_bracket_index + 1 :].strip()
|
|
||||||
|
|
||||||
if cleaned_content != original_content_for_filter.strip():
|
|
||||||
logger.warning(
|
|
||||||
"检测到并清理了模型生成的不规范回复格式。"
|
|
||||||
f"原始内容: '{original_content_for_filter}', 清理后: '{cleaned_content}'"
|
|
||||||
)
|
|
||||||
content = cleaned_content
|
|
||||||
|
|
||||||
logger.debug(f"replyer生成内容: {content}")
|
logger.debug(f"replyer生成内容: {content}")
|
||||||
return content, reasoning_content, model_name, tool_calls
|
return content, reasoning_content, model_name, tool_calls
|
||||||
@@ -2115,6 +2100,72 @@ class DefaultReplyer:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"存储聊天记忆失败: {e}")
|
logger.error(f"存储聊天记忆失败: {e}")
|
||||||
|
|
||||||
|
def _apply_system_format_filter(self, content: str) -> str:
|
||||||
|
"""
|
||||||
|
应用增强的系统格式过滤器,移除各种系统格式化文本
|
||||||
|
|
||||||
|
此方法过滤以下类型的系统格式化内容:
|
||||||
|
1. 回复格式:[回复xxx],说:xxx
|
||||||
|
2. 表情包格式:[表情包:xxx]
|
||||||
|
3. 图片格式:[图片:xxx]
|
||||||
|
4. @格式:@<xxx>
|
||||||
|
5. 原有的[回复开头格式
|
||||||
|
|
||||||
|
Args:
|
||||||
|
content: 原始内容
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
过滤后的内容
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
if not content:
|
||||||
|
return content
|
||||||
|
|
||||||
|
original_content = content
|
||||||
|
cleaned_content = content.strip()
|
||||||
|
|
||||||
|
# 1. 移除回复格式:[回复xxx],说:xxx(各种变体)
|
||||||
|
# 匹配所有包含"],说:"格式的回复
|
||||||
|
cleaned_content = re.sub(r"\[回复[^\]]*\],说:\s*", "", cleaned_content)
|
||||||
|
# 匹配 [回复<xxx:数字>],说:xxx 格式
|
||||||
|
cleaned_content = re.sub(r"\[回复<[^>]*>\],说:\s*", "", cleaned_content)
|
||||||
|
|
||||||
|
# 2. 处理原有的[回复开头格式(保持向后兼容)
|
||||||
|
# 注意:这步要在上面处理完成后再执行,避免冲突
|
||||||
|
if cleaned_content.startswith("[回复"):
|
||||||
|
last_bracket_index = cleaned_content.rfind("]")
|
||||||
|
if last_bracket_index != -1:
|
||||||
|
cleaned_content = cleaned_content[last_bracket_index + 1 :].strip()
|
||||||
|
|
||||||
|
# 3. 移除表情包格式:[表情包:xxx]
|
||||||
|
cleaned_content = re.sub(r"\[表情包:[^\]]*\]", "", cleaned_content)
|
||||||
|
|
||||||
|
# 4. 移除图片格式:[图片:xxx]
|
||||||
|
cleaned_content = re.sub(r"\[图片:[^\]]*\]", "", cleaned_content)
|
||||||
|
|
||||||
|
# 5. 移除@格式:@<xxx>
|
||||||
|
cleaned_content = re.sub(r"@<[^>]*>", "", cleaned_content)
|
||||||
|
|
||||||
|
# 6. 移除其他可能的系统格式
|
||||||
|
# [表情包(描述生成失败)] 等错误格式
|
||||||
|
cleaned_content = re.sub(r"\[表情包\([^)]*\)\]", "", cleaned_content)
|
||||||
|
# [图片(描述生成失败)] 等错误格式
|
||||||
|
cleaned_content = re.sub(r"\[图片\([^)]*\)\]", "", cleaned_content)
|
||||||
|
|
||||||
|
# 清理多余空格
|
||||||
|
cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip()
|
||||||
|
|
||||||
|
# 记录过滤操作
|
||||||
|
if cleaned_content != original_content.strip():
|
||||||
|
logger.info(
|
||||||
|
f"[格式过滤器] 检测到并清理了系统格式化文本。"
|
||||||
|
f"原始内容: '{original_content}', "
|
||||||
|
f"清理后: '{cleaned_content}'"
|
||||||
|
)
|
||||||
|
|
||||||
|
return cleaned_content
|
||||||
|
|
||||||
|
|
||||||
def weighted_sample_no_replacement(items, weights, k) -> list:
|
def weighted_sample_no_replacement(items, weights, k) -> list:
|
||||||
"""
|
"""
|
||||||
|
|||||||
Reference in New Issue
Block a user