Merge branch 'dev' of https://github.com/MoFox-Studio/MoFox-Core into dev
This commit is contained in:
@@ -224,10 +224,12 @@ async def get_raw_msg_by_timestamp_with_chat_inclusive(
|
||||
limit: int = 0,
|
||||
limit_mode: str = "latest",
|
||||
filter_bot=False,
|
||||
filter_meaningless=False,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""获取在特定聊天从指定时间戳到指定时间戳的消息(包含边界),按时间升序排序,返回消息列表
|
||||
limit: 限制返回的消息数量,0为不限制
|
||||
limit_mode: 当 limit > 0 时生效。 'earliest' 表示获取最早的记录, 'latest' 表示获取最新的记录。默认为 'latest'。
|
||||
filter_meaningless: 是否过滤无意义消息(表情包、通知等)。用于表达学习等场景。
|
||||
"""
|
||||
filter_query = {"chat_id": chat_id, "time": {"$gte": timestamp_start, "$lte": timestamp_end}}
|
||||
# 只有当 limit 为 0 时才应用外部 sort
|
||||
@@ -235,7 +237,12 @@ async def get_raw_msg_by_timestamp_with_chat_inclusive(
|
||||
# 直接将 limit_mode 传递给 find_messages
|
||||
|
||||
return await find_messages(
|
||||
message_filter=filter_query, sort=sort_order, limit=limit, limit_mode=limit_mode, filter_bot=filter_bot
|
||||
message_filter=filter_query,
|
||||
sort=sort_order,
|
||||
limit=limit,
|
||||
limit_mode=limit_mode,
|
||||
filter_bot=filter_bot,
|
||||
filter_meaningless=filter_meaningless,
|
||||
)
|
||||
|
||||
|
||||
@@ -1114,10 +1121,14 @@ async def build_readable_messages(
|
||||
return "".join(result_parts)
|
||||
|
||||
|
||||
async def build_anonymous_messages(messages: list[dict[str, Any]]) -> str:
|
||||
async def build_anonymous_messages(messages: list[dict[str, Any]], filter_for_learning: bool = False) -> str:
|
||||
"""
|
||||
构建匿名可读消息,将不同人的名称转为唯一占位符(A、B、C...),bot自己用SELF。
|
||||
处理 回复<aaa:bbb> 和 @<aaa:bbb> 字段,将bbb映射为匿名占位符。
|
||||
|
||||
Args:
|
||||
messages: 消息列表
|
||||
filter_for_learning: 是否为表达学习场景进行额外过滤(过滤掉纯回复、纯@、纯图片等无意义内容)
|
||||
"""
|
||||
assert global_config is not None
|
||||
if not messages:
|
||||
@@ -1151,6 +1162,52 @@ async def build_anonymous_messages(messages: list[dict[str, Any]]) -> str:
|
||||
person_map[person_id] = chr(current_char)
|
||||
current_char += 1
|
||||
return person_map[person_id]
|
||||
|
||||
def is_meaningless_content(content: str, msg: dict) -> bool:
    """Decide whether a message has no substantive content (expression-learning filter).

    A message is treated as meaningless when any of the following holds:

    * ``content`` is empty or whitespace only;
    * ``msg`` has a truthy ``is_emoji``, ``is_notify``, ``is_public_notice``
      or ``is_command`` field;
    * the text is nothing but a single ``[回复...]`` reply marker;
    * the text is nothing but ``@mention`` tokens;
    * the text is nothing but a single image/sticker placeholder;
    * fewer than 2 characters remain once reply markers, mentions and
      image placeholders are removed.

    Args:
        content: Rendered text of the message.
        msg: Message record; only the flag fields listed above are read.

    Returns:
        True if the message should be skipped, False otherwise.
    """
    if not content:
        return True
    stripped = content.strip()
    if not stripped:
        return True

    # Flag fields on the message record short-circuit any text inspection.
    flag_fields = ("is_emoji", "is_notify", "is_public_notice", "is_command")
    if any(msg.get(flag, False) for flag in flag_fields):
        return True

    # Reply-only message: a single [回复...] marker and nothing else.
    if re.fullmatch(r"\[回复[^\]]*\]", stripped):
        return True

    # Mention-only message: every whitespace-separated token is "@" plus at
    # least one more character. This accepts exactly the strings the regex
    # ^\s*(@[^\s]+\s*)+$ accepts, but in linear time: the nested quantifiers
    # in that regex backtrack exponentially on inputs such as a long run of
    # "@" followed by a non-mention token.
    if all(len(token) > 1 and token.startswith("@") for token in stripped.split()):
        return True

    # Image-only message: a single picture / sticker placeholder.
    if re.fullmatch(r"\[图片\]|\[动画表情\]|\[表情\]|\[picid:[^\]]+\]", stripped):
        return True

    # Strip reply markers, mentions and image placeholders, then check
    # whether anything substantive is left.
    remainder = re.sub(r"\[回复[^\]]*\]", "", stripped)
    remainder = re.sub(r"@[^\s]+", "", remainder)
    remainder = re.sub(r"\[图片\]|\[动画表情\]|\[表情\]|\[picid:[^\]]+\]", "", remainder)
    remainder = remainder.strip()

    # Fewer than 2 remaining characters is considered meaningless.
    return len(remainder) < 2
|
||||
|
||||
for msg in messages:
|
||||
try:
|
||||
@@ -1170,6 +1227,10 @@ async def build_anonymous_messages(messages: list[dict[str, Any]]) -> str:
|
||||
|
||||
# For anonymous messages, we just replace with a placeholder.
|
||||
content = re.sub(r"\[picid:([^\]]+)\]", "[图片]", content)
|
||||
|
||||
# 🔥 表达学习场景:过滤无意义消息
|
||||
if filter_for_learning and is_meaningless_content(content, msg):
|
||||
continue
|
||||
|
||||
# if not all([platform, user_id, timestamp is not None]):
|
||||
# continue
|
||||
|
||||
@@ -168,15 +168,22 @@ class ImageManager:
|
||||
image_bytes = base64.b64decode(image_base64)
|
||||
image_hash = hashlib.md5(image_bytes).hexdigest()
|
||||
|
||||
# 如果缓存命中,可以提前释放 image_bytes
|
||||
# 但如果需要保存表情包,则需要保留 image_bytes
|
||||
|
||||
# 2. 优先查询已注册表情的缓存(Emoji表)
|
||||
if full_description := await emoji_manager.get_emoji_description_by_hash(image_hash):
|
||||
logger.info("[缓存命中] 使用已注册表情包(Emoji表)的完整描述")
|
||||
del image_bytes # 缓存命中,不再需要
|
||||
del image_base64
|
||||
refined_part = full_description.split(" Keywords:")[0]
|
||||
return f"[表情包:{refined_part}]"
|
||||
|
||||
# 3. 查询通用图片描述缓存(ImageDescriptions表)
|
||||
if cached_description := await self._get_description_from_db(image_hash, "emoji"):
|
||||
logger.info("[缓存命中] 使用通用图片缓存(ImageDescriptions表)中的描述")
|
||||
del image_bytes # 缓存命中,不再需要
|
||||
del image_base64
|
||||
refined_part = cached_description.split(" Keywords:")[0]
|
||||
return f"[表情包:{refined_part}]"
|
||||
|
||||
@@ -209,7 +216,11 @@ class ImageManager:
|
||||
await self._save_description_to_db(image_hash, full_description, "emoji")
|
||||
logger.info(f"新生成的表情包描述已存入通用缓存 (Hash: {image_hash[:8]}...)")
|
||||
|
||||
# 6. 返回新生成的描述中用于显示的“精炼描述”部分
|
||||
# 内存优化:处理完成后主动释放大型二进制数据
|
||||
del image_bytes
|
||||
del image_base64
|
||||
|
||||
# 6. 返回新生成的描述中用于显示的"精炼描述"部分
|
||||
refined_part = full_description.split(" Keywords:")[0]
|
||||
return f"[表情包:{refined_part}]"
|
||||
|
||||
@@ -248,11 +259,17 @@ class ImageManager:
|
||||
existing_image = result.scalar()
|
||||
if existing_image and existing_image.description:
|
||||
logger.debug(f"[缓存命中] 使用Images表中的图片描述: {existing_image.description[:50]}...")
|
||||
# 缓存命中,释放 base64 和 image_bytes
|
||||
del image_bytes
|
||||
del image_base64
|
||||
return f"[图片:{existing_image.description}]"
|
||||
|
||||
# 3. 其次查询 ImageDescriptions 表缓存
|
||||
if cached_description := await self._get_description_from_db(image_hash, "image"):
|
||||
logger.debug(f"[缓存命中] 使用ImageDescriptions表中的描述: {cached_description[:50]}...")
|
||||
# 缓存命中,释放 base64 和 image_bytes
|
||||
del image_bytes
|
||||
del image_base64
|
||||
return f"[图片:{cached_description}]"
|
||||
|
||||
# 4. 如果都未命中,则同步调用VLM生成新描述
|
||||
@@ -301,6 +318,10 @@ class ImageManager:
|
||||
|
||||
logger.info(f"新生成的图片描述已存入缓存 (Hash: {image_hash[:8]}...)")
|
||||
|
||||
# 内存优化:处理完成后主动释放大型二进制数据
|
||||
del image_bytes
|
||||
del image_base64
|
||||
|
||||
return f"[图片:{description}]"
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user