From 16afd8f6ff374dc6ed329937317867d4b3ffff81 Mon Sep 17 00:00:00 2001
From: Windpicker-owo <3431391539@qq.com>
Date: Wed, 3 Dec 2025 13:13:41 +0800
Subject: [PATCH] =?UTF-8?q?feat(filter):=20=E6=B7=BB=E5=8A=A0=E6=97=A0?=
 =?UTF-8?q?=E6=84=8F=E4=B9=89=E6=B6=88=E6=81=AF=E8=BF=87=E6=BB=A4=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD=E4=BB=A5=E4=BC=98=E5=8C=96=E8=A1=A8=E8=BE=BE=E5=AD=A6?=
 =?UTF-8?q?=E4=B9=A0=E6=95=88=E6=9E=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/chat/express/expression_learner.py | 11 ++++-
 src/chat/utils/chat_message_builder.py | 65 +++++++++++++++++++++++++-
 src/common/message_repository.py       | 10 ++++
 3 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/src/chat/express/expression_learner.py b/src/chat/express/expression_learner.py
index c1c7dd1e7..6343badb5 100644
--- a/src/chat/express/expression_learner.py
+++ b/src/chat/express/expression_learner.py
@@ -670,13 +670,14 @@ class ExpressionLearner:
 
         current_time = time.time()
 
-        # 获取上次学习时间，过滤掉机器人自己的消息
+        # 获取上次学习时间，过滤掉机器人自己的消息和无意义消息
         random_msg: list[dict[str, Any]] | None = await get_raw_msg_by_timestamp_with_chat_inclusive(
             chat_id=self.chat_id,
             timestamp_start=self.last_learning_time,
             timestamp_end=current_time,
             limit=num,
             filter_bot=True,  # 过滤掉机器人自己的消息，防止学习自己的表达方式
+            filter_meaningless=True,  # 🔥 过滤掉表情包、通知等无意义消息
         )
 
         # print(random_msg)
@@ -685,8 +686,14 @@ class ExpressionLearner:
         # 转化成str
         chat_id: str = random_msg[0]["chat_id"]
         # random_msg_str: str = build_readable_messages(random_msg, timestamp_mode="normal")
-        random_msg_str: str = await build_anonymous_messages(random_msg)
+        # 🔥 启用表达学习场景的过滤，过滤掉纯回复、纯@、纯图片等无意义内容
+        random_msg_str: str = await build_anonymous_messages(random_msg, filter_for_learning=True)
         # print(f"random_msg_str:{random_msg_str}")
+        
+        # 🔥 检查过滤后是否还有足够的内容
+        if not random_msg_str or len(random_msg_str.strip()) < 20:
+            logger.debug(f"过滤后消息内容不足，跳过本次{type_str}学习")
+            return None
 
         prompt: str = await global_prompt_manager.format_prompt(
             prompt,
diff --git a/src/chat/utils/chat_message_builder.py b/src/chat/utils/chat_message_builder.py
index 7cd4e0596..79883fe7b 100644
--- a/src/chat/utils/chat_message_builder.py
+++ b/src/chat/utils/chat_message_builder.py
@@ -224,10 +224,12 @@ async def get_raw_msg_by_timestamp_with_chat_inclusive(
     limit: int = 0,
     limit_mode: str = "latest",
     filter_bot=False,
+    filter_meaningless=False,
 ) -> list[dict[str, Any]]:
     """获取在特定聊天从指定时间戳到指定时间戳的消息（包含边界），按时间升序排序，返回消息列表
     limit: 限制返回的消息数量，0为不限制
     limit_mode: 当 limit > 0 时生效。 'earliest' 表示获取最早的记录， 'latest' 表示获取最新的记录。默认为 'latest'。
+    filter_meaningless: 是否过滤无意义消息（表情包、通知等）。用于表达学习等场景。
     """
     filter_query = {"chat_id": chat_id, "time": {"$gte": timestamp_start, "$lte": timestamp_end}}
     # 只有当 limit 为 0 时才应用外部 sort
@@ -235,7 +237,12 @@ async def get_raw_msg_by_timestamp_with_chat_inclusive(
     # 直接将 limit_mode 传递给 find_messages
 
     return await find_messages(
-        message_filter=filter_query, sort=sort_order, limit=limit, limit_mode=limit_mode, filter_bot=filter_bot
+        message_filter=filter_query,
+        sort=sort_order,
+        limit=limit,
+        limit_mode=limit_mode,
+        filter_bot=filter_bot,
+        filter_meaningless=filter_meaningless,
     )
 
 
@@ -1114,10 +1121,14 @@ async def build_readable_messages(
         return "".join(result_parts)
 
 
-async def build_anonymous_messages(messages: list[dict[str, Any]]) -> str:
+async def build_anonymous_messages(messages: list[dict[str, Any]], filter_for_learning: bool = False) -> str:
     """
     构建匿名可读消息，将不同人的名称转为唯一占位符（A、B、C...），bot自己用SELF。
     处理 回复<aaa:bbb> 和 @<aaa:bbb> 字段，将bbb映射为匿名占位符。
+    
+    Args:
+        messages: 消息列表
+        filter_for_learning: 是否为表达学习场景进行额外过滤（过滤掉纯回复、纯@、纯图片等无意义内容）
     """
     assert global_config is not None
     if not messages:
@@ -1151,6 +1162,52 @@ async def build_anonymous_messages(messages: list[dict[str, Any]]) -> str:
             person_map[person_id] = chr(current_char)
             current_char += 1
         return person_map[person_id]
+    
+    def is_meaningless_content(content: str, msg: dict) -> bool:
+        """Return True when a message has nothing useful for expression learning.
+
+        In the visible hunk it is called from the message loop when filter_for_learning is set.
+
+        Args:
+            content: Message text to inspect. At the visible call site the
+                picid markers have already been rewritten to [图片].
+            msg: Message dict; only its is_emoji, is_notify, is_public_notice
+                and is_command flags are read.
+
+        Returns:
+            True for blank text, for flagged messages, and for text with fewer
+            than two characters left after reply, mention and image markers
+            are removed; False otherwise.
+        """
+        if not content or not content.strip():
+            return True
+
+        # Any of these flags marks the whole message as noise.
+        flag_keys = ("is_emoji", "is_notify", "is_public_notice", "is_command")
+        if any(msg.get(key, False) for key in flag_keys):
+            return True
+
+        # Marker shapes that carry no wording of their own.
+        reply_marker = r"\[回复[^\]]*\]"
+        # The enclosing docstring describes mentions as @<aaa:bbb>. Match that
+        # exact span first so text typed right after it without a space
+        # survives and a name containing spaces is removed whole; a bare
+        # @name still runs to the next whitespace.
+        mention_marker = r"@<[^<>]*>|@\S+"
+        image_marker = r"\[图片\]|\[动画表情\]|\[表情\]|\[picid:[^\]]+\]"
+
+        # Remove every marker and judge what is left. A message made only of
+        # markers (pure reply, pure mention, pure image) ends up empty here,
+        # so those cases need no dedicated check.
+        remainder = content.strip()
+        for marker in (reply_marker, mention_marker, image_marker):
+            remainder = re.sub(marker, "", remainder)
+
+        # Fewer than two characters of real text is not worth learning from.
+        if len(remainder.strip()) < 2:
+            return True
+
+        return False
 
     for msg in messages:
         try:
@@ -1170,6 +1227,10 @@ async def build_anonymous_messages(messages: list[dict[str, Any]]) -> str:
 
             # For anonymous messages, we just replace with a placeholder.
             content = re.sub(r"\[picid:([^\]]+)\]", "[图片]", content)
+            
+            # 🔥 表达学习场景：过滤无意义消息
+            if filter_for_learning and is_meaningless_content(content, msg):
+                continue
 
             # if not all([platform, user_id, timestamp is not None]):
             # continue
diff --git a/src/common/message_repository.py b/src/common/message_repository.py
index b74b76f20..392fd001d 100644
--- a/src/common/message_repository.py
+++ b/src/common/message_repository.py
@@ -38,6 +38,7 @@ async def find_messages(
     limit_mode: str = "latest",
     filter_bot=False,
     filter_command=False,
+    filter_meaningless=False,
 ) -> list[dict[str, Any]]:
     """
     根据提供的过滤器、排序和限制条件查找消息。
@@ -47,6 +48,7 @@ async def find_messages(
         sort: 排序条件列表，例如 [('time', 1)] (1 for asc, -1 for desc)。仅在 limit 为 0 时生效。
         limit: 返回的最大文档数，0表示不限制。
         limit_mode: 当 limit > 0 时生效。 'earliest' 表示获取最早的记录， 'latest' 表示获取最新的记录（结果仍按时间正序排列）。默认为 'latest'。
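+        filter_meaningless: Whether to exclude messages flagged as emoji, notify, public notice, or command. Intended for expression learning and similar scenarios.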
 
     Returns:
         消息字典列表，如果出错则返回空列表。
@@ -95,6 +97,14 @@ async def find_messages(
             if filter_command:
                 query = query.where(not_(Messages.is_command))
 
+            # 🔥 过滤无意义消息（用于表达学习等场景）
+            if filter_meaningless:
+                # 排除：纯表情包、通知消息、公告消息、命令消息
+                query = query.where(not_(Messages.is_emoji))
+                query = query.where(not_(Messages.is_notify))
+                query = query.where(not_(Messages.is_public_notice))
+                query = query.where(not_(Messages.is_command))
+
             if limit > 0:
                 # 确保limit是正整数
                 limit = max(1, int(limit))