better:回复现在依照条数和长度而不是时间引用

This commit is contained in:
SengokuCola
2025-04-04 00:30:27 +08:00
parent e717bc34b0
commit a92aa35e72
7 changed files with 177 additions and 56 deletions

View File

@@ -487,3 +487,108 @@ def is_western_char(char):
def is_western_paragraph(paragraph):
"""检测是否为西文字符段落"""
return all(is_western_char(char) for char in paragraph if char.isalnum())
def count_messages_between(start_time: float, end_time: float, stream_id: str) -> tuple[int, int]:
"""计算两个时间点之间的消息数量和文本总长度
Args:
start_time (float): 起始时间戳
end_time (float): 结束时间戳
stream_id (str): 聊天流ID
Returns:
tuple[int, int]: (消息数量, 文本总长度)
- 消息数量:包含起始时间的消息,不包含结束时间的消息
- 文本总长度所有消息的processed_plain_text长度之和
"""
try:
# 获取开始时间之前最新的一条消息
start_message = db.messages.find_one(
{
"chat_id": stream_id,
"time": {"$lte": start_time}
},
sort=[("time", -1), ("_id", -1)] # 按时间倒序_id倒序最后插入的在前
)
# 获取结束时间最近的一条消息
# 先找到结束时间点的所有消息
end_time_messages = list(db.messages.find(
{
"chat_id": stream_id,
"time": {"$lte": end_time}
},
sort=[("time", -1)] # 先按时间倒序
).limit(10)) # 限制查询数量,避免性能问题
if not end_time_messages:
logger.warning(f"未找到结束时间 {end_time} 之前的消息")
return 0, 0
# 找到最大时间
max_time = end_time_messages[0]["time"]
# 在最大时间的消息中找最后插入的_id最大的
end_message = max(
[msg for msg in end_time_messages if msg["time"] == max_time],
key=lambda x: x["_id"]
)
if not start_message:
logger.warning(f"未找到开始时间 {start_time} 之前的消息")
return 0, 0
# 调试输出
# print("\n=== 消息范围信息 ===")
# print("Start message:", {
# "message_id": start_message.get("message_id"),
# "time": start_message.get("time"),
# "text": start_message.get("processed_plain_text", ""),
# "_id": str(start_message.get("_id"))
# })
# print("End message:", {
# "message_id": end_message.get("message_id"),
# "time": end_message.get("time"),
# "text": end_message.get("processed_plain_text", ""),
# "_id": str(end_message.get("_id"))
# })
# print("Stream ID:", stream_id)
# 如果结束消息的时间等于开始时间返回0
if end_message["time"] == start_message["time"]:
return 0, 0
# 获取并打印这个时间范围内的所有消息
# print("\n=== 时间范围内的所有消息 ===")
all_messages = list(db.messages.find(
{
"chat_id": stream_id,
"time": {
"$gte": start_message["time"],
"$lte": end_message["time"]
}
},
sort=[("time", 1), ("_id", 1)] # 按时间正序_id正序
))
count = 0
total_length = 0
for msg in all_messages:
count += 1
text_length = len(msg.get("processed_plain_text", ""))
total_length += text_length
# print(f"\n消息 {count}:")
# print({
# "message_id": msg.get("message_id"),
# "time": msg.get("time"),
# "text": msg.get("processed_plain_text", ""),
# "text_length": text_length,
# "_id": str(msg.get("_id"))
# })
# 如果时间不同需要把end_message本身也计入
return count - 1, total_length
except Exception as e:
logger.error(f"计算消息数量时出错: {str(e)}")
return 0, 0