feat:每个消息段有10%概率丢弃

This commit is contained in:
SengokuCola
2025-07-08 17:30:03 +08:00
parent 6782925a1f
commit 4c18426b23

View File

@@ -2,6 +2,7 @@ import time
import traceback
import os
import pickle
import random
from typing import List, Dict
from src.config.config import global_config
from src.common.logger import get_logger
@@ -415,11 +416,28 @@ class RelationshipBuilder:
async def update_impression_on_segments(self, person_id: str, chat_id: str, segments: List[Dict[str, any]]):
"""基于消息段更新用户印象"""
logger.debug(f"开始为 {person_id} 基于 {len(segments)} 个消息段更新印象")
original_segment_count = len(segments)
logger.debug(f"开始为 {person_id} 基于 {original_segment_count} 个消息段更新印象")
try:
# 筛选要处理的消息段每个消息段有10%的概率被丢弃
segments_to_process = [s for s in segments if random.random() >= 0.1]
# 如果所有消息段都被丢弃,但原来有消息段,则至少保留一个(最新的)
if not segments_to_process and segments:
segments.sort(key=lambda x: x["end_time"], reverse=True)
segments_to_process.append(segments[0])
logger.debug(f"随机丢弃了所有消息段,强制保留最新的一个以进行处理。")
dropped_count = original_segment_count - len(segments_to_process)
if dropped_count > 0:
logger.info(f"{person_id} 随机丢弃了 {dropped_count} / {original_segment_count} 个消息段")
processed_messages = []
for i, segment in enumerate(segments):
# 对筛选后的消息段进行排序,确保时间顺序
segments_to_process.sort(key=lambda x: x["start_time"])
for segment in segments_to_process:
start_time = segment["start_time"]
end_time = segment["end_time"]
start_date = time.strftime("%Y-%m-%d %H:%M", time.localtime(start_time))
@@ -427,12 +445,12 @@ class RelationshipBuilder:
# 获取该段的消息(包含边界)
segment_messages = get_raw_msg_by_timestamp_with_chat_inclusive(self.chat_id, start_time, end_time)
logger.debug(
f"消息段 {i + 1}: {start_date} - {time.strftime('%Y-%m-%d %H:%M', time.localtime(end_time))}, 消息数: {len(segment_messages)}"
f"消息段: {start_date} - {time.strftime('%Y-%m-%d %H:%M', time.localtime(end_time))}, 消息数: {len(segment_messages)}"
)
if segment_messages:
# 如果不是第一个消息段,在消息列表前添加间隔标识
if i > 0:
# 如果 processed_messages 不为空,说明这不是第一个被处理的消息段,在消息列表前添加间隔标识
if processed_messages:
# 创建一个特殊的间隔消息
gap_message = {
"time": start_time - 0.1, # 稍微早于段开始时间