diff --git a/src/chat/replyer/default_generator.py b/src/chat/replyer/default_generator.py index f9678c231..0e535f181 100644 --- a/src/chat/replyer/default_generator.py +++ b/src/chat/replyer/default_generator.py @@ -1841,8 +1841,9 @@ class DefaultReplyer: # 移除 [SPLIT] 标记,防止消息被分割 content = content.replace("[SPLIT]", "") - # 应用增强的格式过滤器 - content = self._apply_system_format_filter(content) + # 应用统一的格式过滤器 + from src.chat.utils.utils import filter_system_format_content + content = filter_system_format_content(content) logger.debug(f"replyer生成内容: {content}") return content, reasoning_content, model_name, tool_calls @@ -2129,72 +2130,7 @@ class DefaultReplyer: except Exception as e: logger.error(f"存储聊天记忆失败: {e}") - def _apply_system_format_filter(self, content: str) -> str: - """ - 应用增强的系统格式过滤器,移除各种系统格式化文本 - - 此方法过滤以下类型的系统格式化内容: - 1. 回复格式:[回复xxx],说:xxx - 2. 表情包格式:[表情包:xxx] - 3. 图片格式:[图片:xxx] - 4. @格式:@ - 5. 原有的[回复开头格式 - - Args: - content: 原始内容 - - Returns: - 过滤后的内容 - """ - import re - - if not content: - return content - - original_content = content - cleaned_content = content.strip() - - # 1. 移除回复格式:[回复xxx],说:xxx(各种变体) - # 匹配所有包含"],说:"格式的回复 - cleaned_content = re.sub(r"\[回复[^\]]*\],说:\s*", "", cleaned_content) - # 匹配 [回复],说:xxx 格式 - cleaned_content = re.sub(r"\[回复<[^>]*>\],说:\s*", "", cleaned_content) - - # 2. 处理原有的[回复开头格式(保持向后兼容) - # 注意:这步要在上面处理完成后再执行,避免冲突 - if cleaned_content.startswith("[回复"): - last_bracket_index = cleaned_content.rfind("]") - if last_bracket_index != -1: - cleaned_content = cleaned_content[last_bracket_index + 1 :].strip() - - # 3. 移除表情包格式:[表情包:xxx] - cleaned_content = re.sub(r"\[表情包:[^\]]*\]", "", cleaned_content) - - # 4. 移除图片格式:[图片:xxx] - cleaned_content = re.sub(r"\[图片:[^\]]*\]", "", cleaned_content) - - # 5. 移除@格式:@ - cleaned_content = re.sub(r"@<[^>]*>", "", cleaned_content) - - # 6. 移除其他可能的系统格式 - # [表情包(描述生成失败)] 等错误格式 - cleaned_content = re.sub(r"\[表情包\([^)]*\)\]", "", cleaned_content) - # [图片(描述生成失败)] 等错误格式 - cleaned_content = re.sub(r"\[图片\([^)]*\)\]", "", cleaned_content) - - # 清理多余空格 - cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip() - - # 记录过滤操作 - if cleaned_content != original_content.strip(): - logger.info( - f"[格式过滤器] 检测到并清理了系统格式化文本。" - f"原始内容: '{original_content}', " - f"清理后: '{cleaned_content}'" - ) - - return cleaned_content - + def weighted_sample_no_replacement(items, weights, k) -> list: """ diff --git a/src/chat/utils/utils.py b/src/chat/utils/utils.py index 8cd396115..7ac906cc6 100644 --- a/src/chat/utils/utils.py +++ b/src/chat/utils/utils.py @@ -928,67 +928,68 @@ def assign_message_ids_flexible( # result3 = assign_message_ids_flexible(messages, prefix="ts", use_timestamp=True) # # 结果: [{'id': 'ts123a1b', 'message': 'Hello'}, {'id': 'ts123c2d', 'message': 'World'}, {'id': 'ts123e3f', 'message': 'Test message'}] -def parse_keywords_string(keywords_input) -> list[str]: + +def filter_system_format_content(content: str | None) -> str: """ - 统一的关键词解析函数,支持多种格式的关键词字符串解析 - - 支持的格式: - 1. 字符串列表格式:'["utils.py", "修改", "代码", "动作"]' - 2. 斜杠分隔格式:'utils.py/修改/代码/动作' - 3. 逗号分隔格式:'utils.py,修改,代码,动作' - 4. 空格分隔格式:'utils.py 修改 代码 动作' - 5. 已经是列表的情况:["utils.py", "修改", "代码", "动作"] - 6. JSON格式字符串:'{"keywords": ["utils.py", "修改", "代码", "动作"]}' - + 过滤系统格式化内容,移除回复、@、图片、表情包等系统生成的格式文本 + + 此方法过滤以下类型的系统格式化内容: + 1. 回复格式:[回复xxx],说:xxx + 2. 表情包格式:[表情包:xxx] + 3. 图片格式:[图片:xxx] + 4. @格式:@ + 5. 错误格式:[表情包(...)]、[图片(...)] + 6. [回复开头的格式 + Args: - keywords_input: 关键词输入,可以是字符串或列表 - + content: 原始内容 + Returns: - list[str]: 解析后的关键词列表,去除空白项 + 过滤后的纯文本内容 """ - if not keywords_input: - return [] - - # 如果已经是列表,直接处理 - if isinstance(keywords_input, list): - return [str(k).strip() for k in keywords_input if str(k).strip()] - - # 转换为字符串处理 - keywords_str = str(keywords_input).strip() - if not keywords_str: - return [] - - try: - # 尝试作为JSON对象解析(支持 {"keywords": [...]} 格式) - import json - json_data = json.loads(keywords_str) - if isinstance(json_data, dict) and "keywords" in json_data: - keywords_list = json_data["keywords"] - if isinstance(keywords_list, list): - return [str(k).strip() for k in keywords_list if str(k).strip()] - elif isinstance(json_data, list): - # 直接是JSON数组格式 - return [str(k).strip() for k in json_data if str(k).strip()] - except (json.JSONDecodeError, ValueError): - pass - - try: - # 尝试使用 ast.literal_eval 解析(支持Python字面量格式) - import ast - parsed = ast.literal_eval(keywords_str) - if isinstance(parsed, list): - return [str(k).strip() for k in parsed if str(k).strip()] - except (ValueError, SyntaxError): - pass - - # 尝试不同的分隔符 - separators = ['/', ',', ' ', '|', ';'] - - for separator in separators: - if separator in keywords_str: - keywords_list = [k.strip() for k in keywords_str.split(separator) if k.strip()] - if len(keywords_list) > 1: # 确保分割有效 - return keywords_list - - # 如果没有分隔符,返回单个关键词 - return [keywords_str] if keywords_str else [] \ No newline at end of file + if not content: + return "" + + original_content = content + cleaned_content = content.strip() + + # 1. 移除回复格式:[回复xxx],说:xxx(各种变体) + # 匹配所有包含"],说:"格式的回复 + cleaned_content = re.sub(r"\[回复[^\]]*\],说:\s*", "", cleaned_content) + # 匹配 [回复],说:xxx 格式 + cleaned_content = re.sub(r"\[回复<[^>]*>\],说:\s*", "", cleaned_content) + + # 2. 处理原有的[回复开头格式(保持向后兼容) + # 注意:这步要在上面处理完成后再执行,避免冲突 + if cleaned_content.startswith("[回复"): + last_bracket_index = cleaned_content.rfind("]") + if last_bracket_index != -1: + cleaned_content = cleaned_content[last_bracket_index + 1 :].strip() + + # 3. 移除表情包格式:[表情包:xxx] + cleaned_content = re.sub(r"\[表情包:[^\]]*\]", "", cleaned_content) + + # 4. 移除图片格式:[图片:xxx] + cleaned_content = re.sub(r"\[图片:[^\]]*\]", "", cleaned_content) + + # 5. 移除@格式:@ + cleaned_content = re.sub(r"@<[^>]*>", "", cleaned_content) + + # 6. 移除其他可能的系统格式 + # [表情包(描述生成失败)] 等错误格式 + cleaned_content = re.sub(r"\[表情包\([^)]*\)\]", "", cleaned_content) + # [图片(描述生成失败)] 等错误格式 + cleaned_content = re.sub(r"\[图片\([^)]*\)\]", "", cleaned_content) + + # 清理多余空格 + cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip() + + # 记录过滤操作 + if cleaned_content != original_content.strip(): + logger.info( + f"[系统格式过滤器] 检测到并清理了系统格式化文本。" + f"原始内容: '{original_content}', " + f"清理后: '{cleaned_content}'" + ) + + return cleaned_content diff --git a/src/plugins/built_in/affinity_flow_chatter/proactive/proactive_thinking_executor.py b/src/plugins/built_in/affinity_flow_chatter/proactive/proactive_thinking_executor.py index 84cf86ce5..b037428a8 100644 --- a/src/plugins/built_in/affinity_flow_chatter/proactive/proactive_thinking_executor.py +++ b/src/plugins/built_in/affinity_flow_chatter/proactive/proactive_thinking_executor.py @@ -482,7 +482,15 @@ class ProactiveThinkingPlanner: return None logger.info(f"生成回复成功: {response[:50]}...") - return response.strip() + + # 应用格式过滤器,确保回复内容不包含系统格式化文本 + from src.chat.utils.utils import filter_system_format_content + filtered_response = filter_system_format_content(response.strip()) + + if filtered_response != response.strip(): + logger.debug(f"主动思考回复已过滤系统格式: '{response.strip()}' -> '{filtered_response}'") + + return filtered_response except Exception as e: logger.error(f"生成回复失败: {e}", exc_info=True)