From 16afd8f6ff374dc6ed329937317867d4b3ffff81 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Wed, 3 Dec 2025 13:13:41 +0800 Subject: [PATCH] =?UTF-8?q?feat(filter):=20=E6=B7=BB=E5=8A=A0=E6=97=A0?= =?UTF-8?q?=E6=84=8F=E4=B9=89=E6=B6=88=E6=81=AF=E8=BF=87=E6=BB=A4=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E4=BB=A5=E4=BC=98=E5=8C=96=E8=A1=A8=E8=BE=BE=E5=AD=A6?= =?UTF-8?q?=E4=B9=A0=E6=95=88=E6=9E=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/express/expression_learner.py | 11 ++++- src/chat/utils/chat_message_builder.py | 65 +++++++++++++++++++++++++- src/common/message_repository.py | 10 ++++ 3 files changed, 82 insertions(+), 4 deletions(-) diff --git a/src/chat/express/expression_learner.py b/src/chat/express/expression_learner.py index c1c7dd1e7..6343badb5 100644 --- a/src/chat/express/expression_learner.py +++ b/src/chat/express/expression_learner.py @@ -670,13 +670,14 @@ class ExpressionLearner: current_time = time.time() - # 获取上次学习时间,过滤掉机器人自己的消息 + # 获取上次学习时间,过滤掉机器人自己的消息和无意义消息 random_msg: list[dict[str, Any]] | None = await get_raw_msg_by_timestamp_with_chat_inclusive( chat_id=self.chat_id, timestamp_start=self.last_learning_time, timestamp_end=current_time, limit=num, filter_bot=True, # 过滤掉机器人自己的消息,防止学习自己的表达方式 + filter_meaningless=True, # 🔥 过滤掉表情包、通知等无意义消息 ) # print(random_msg) @@ -685,8 +686,14 @@ class ExpressionLearner: # 转化成str chat_id: str = random_msg[0]["chat_id"] # random_msg_str: str = build_readable_messages(random_msg, timestamp_mode="normal") - random_msg_str: str = await build_anonymous_messages(random_msg) + # 🔥 启用表达学习场景的过滤,过滤掉纯回复、纯@、纯图片等无意义内容 + random_msg_str: str = await build_anonymous_messages(random_msg, filter_for_learning=True) # print(f"random_msg_str:{random_msg_str}") + + # 🔥 检查过滤后是否还有足够的内容 + if not random_msg_str or len(random_msg_str.strip()) < 20: + logger.debug(f"过滤后消息内容不足,跳过本次{type_str}学习") + return None prompt: str = await global_prompt_manager.format_prompt( 
def is_meaningless_content(content: str, msg: dict[str, Any]) -> bool:
    """Decide whether a message is useless for expression learning.

    A message is considered meaningless when any of the following holds:

    * its text is empty or whitespace only;
    * the message record is flagged as ``is_emoji``, ``is_notify``,
      ``is_public_notice`` or ``is_command``;
    * the text consists solely of @-mentions;
    * fewer than two characters remain after reply markers, @-mentions and
      image / sticker placeholders have been stripped out.

    Args:
        content: The already-rendered message text.
        msg: The raw message record; only the boolean flag fields listed
            above are read from it.

    Returns:
        True if the message should be skipped, False otherwise.
    """
    stripped = content.strip() if content else ""
    if not stripped:
        return True

    # Flag fields carried on the message record itself.
    flag_fields = ("is_emoji", "is_notify", "is_public_notice", "is_command")
    if any(msg.get(field, False) for field in flag_fields):
        return True

    # Pure @-mention message: every whitespace-separated token is "@" followed
    # by at least one more character.  This is what the former regex
    # r"^\s*(@[^\s]+\s*)+$" accepted, but that pattern backtracks
    # exponentially on input such as "@a@a@a...@a x" (the inner class can
    # swallow "@" too), so a single chat message could stall the caller.
    # The token scan below is linear.
    if all(token.startswith("@") and len(token) > 1 for token in stripped.split()):
        return True

    # Strip reply markers, @-mentions and image/sticker placeholders, then see
    # whether anything substantial is left.  A message that is *only* a reply
    # marker or *only* an image placeholder collapses to "" here, so no
    # separate anchored checks are needed for those cases.
    remainder = re.sub(r"\[回复[^\]]*\]", "", stripped)
    remainder = re.sub(r"@[^\s]+", "", remainder)
    remainder = re.sub(r"\[图片\]|\[动画表情\]|\[表情\]|\[picid:[^\]]+\]", "", remainder)

    # Fewer than two remaining characters is treated as no real content.
    return len(remainder.strip()) < 2
content = re.sub(r"\[picid:([^\]]+)\]", "[图片]", content) + + # 🔥 表达学习场景:过滤无意义消息 + if filter_for_learning and is_meaningless_content(content, msg): + continue # if not all([platform, user_id, timestamp is not None]): # continue diff --git a/src/common/message_repository.py b/src/common/message_repository.py index b74b76f20..392fd001d 100644 --- a/src/common/message_repository.py +++ b/src/common/message_repository.py @@ -38,6 +38,7 @@ async def find_messages( limit_mode: str = "latest", filter_bot=False, filter_command=False, + filter_meaningless=False, ) -> list[dict[str, Any]]: """ 根据提供的过滤器、排序和限制条件查找消息。 @@ -47,6 +48,7 @@ async def find_messages( sort: 排序条件列表,例如 [('time', 1)] (1 for asc, -1 for desc)。仅在 limit 为 0 时生效。 limit: 返回的最大文档数,0表示不限制。 limit_mode: 当 limit > 0 时生效。 'earliest' 表示获取最早的记录, 'latest' 表示获取最新的记录(结果仍按时间正序排列)。默认为 'latest'。 + filter_meaningless: 是否过滤无意义消息(表情包、通知、纯回复等)。用于表达学习等场景。 Returns: 消息字典列表,如果出错则返回空列表。 @@ -95,6 +97,14 @@ async def find_messages( if filter_command: query = query.where(not_(Messages.is_command)) + # 🔥 过滤无意义消息(用于表达学习等场景) + if filter_meaningless: + # 排除:纯表情包、通知消息、公告消息、命令消息 + query = query.where(not_(Messages.is_emoji)) + query = query.where(not_(Messages.is_notify)) + query = query.where(not_(Messages.is_public_notice)) + query = query.where(not_(Messages.is_command)) + if limit > 0: # 确保limit是正整数 limit = max(1, int(limit))