Mofox-Core/src/plugins/utils/statistic.py

import threading
import time
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Any, Dict, List

from src.common.logger import get_module_logger
from ...common.database import db

logger = get_module_logger("llm_statistics")


class LLMStatistics:
    def __init__(self, output_file: str = "llm_statistics.txt"):
        """Initialize the LLM statistics collector.

        Args:
            output_file: path of the file the statistics report is written to
        """
        self.output_file = output_file
        self.running = False
        self.stats_thread = None
        self.console_thread = None
        self._init_database()
        # Maps a chat key ("g<group_id>" for groups, "u<user_id>" for private chats)
        # to [display_name, time_of_most_recent_message].
        self.name_dict: Dict[str, List] = {}

    @staticmethod
    def _init_database():
        """Create the database collection used for online-time tracking."""
        if "online_time" not in db.list_collection_names():
            db.create_collection("online_time")
            db.online_time.create_index([("timestamp", 1)])

    def start(self):
        """Start the statistics threads."""
        if not self.running:
            self.running = True
            # File-report thread
            self.stats_thread = threading.Thread(target=self._stats_loop)
            self.stats_thread.daemon = True
            self.stats_thread.start()
            # Console-output thread
            self.console_thread = threading.Thread(target=self._console_output_loop)
            self.console_thread.daemon = True
            self.console_thread.start()

    def stop(self):
        """Stop the statistics threads and wait for them to exit."""
        self.running = False
        if self.stats_thread:
            self.stats_thread.join()
        if self.console_thread:
            self.console_thread.join()

    @staticmethod
    def _record_online_time():
        """Record online time in 5-minute increments."""
        current_time = datetime.now()
        # Skip if a record was already written within the last 5 minutes
        recent_record = db.online_time.find_one({"timestamp": {"$gte": current_time - timedelta(minutes=5)}})
        if not recent_record:
            db.online_time.insert_one(
                {
                    "timestamp": current_time,
                    "duration": 5,  # minutes
                }
            )
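
    # The aggregation below only relies on the field names it reads via .get(); the
    # exact schema lives elsewhere in the project. A hypothetical llm_usage document
    # compatible with this code might look like (illustrative, not authoritative):
    #   {"timestamp": datetime, "request_type": "chat", "user_id": "12345",
    #    "model_name": "some-model", "prompt_tokens": 120, "completion_tokens": 60,
    #    "cost": 0.0021}
    # and a messages document like:
    #   {"time": <unix seconds>, "user_info": {"user_id": ..., "user_nickname": ...},
    #    "chat_info": {"group_info": {"group_id": ..., "group_name": ...}}}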

    def _collect_statistics_for_period(self, start_time: datetime) -> Dict[str, Any]:
        """Collect LLM request statistics for the given period.

        Args:
            start_time: start of the period to aggregate
        """
        stats = {
            "total_requests": 0,
            "requests_by_type": defaultdict(int),
            "requests_by_user": defaultdict(int),
            "requests_by_model": defaultdict(int),
            "average_tokens": 0,
            "total_tokens": 0,
            "total_cost": 0.0,
            "costs_by_user": defaultdict(float),
            "costs_by_type": defaultdict(float),
            "costs_by_model": defaultdict(float),
            # Token statistics fields
            "tokens_by_type": defaultdict(int),
            "tokens_by_user": defaultdict(int),
            "tokens_by_model": defaultdict(int),
            # Online-time statistics
            "online_time_minutes": 0,
            # Message statistics fields
            "total_messages": 0,
            "messages_by_user": defaultdict(int),
            "messages_by_chat": defaultdict(int),
        }

        cursor = db.llm_usage.find({"timestamp": {"$gte": start_time}})
        total_requests = 0
        for doc in cursor:
            stats["total_requests"] += 1
            request_type = doc.get("request_type", "unknown")
            user_id = str(doc.get("user_id", "unknown"))
            model_name = doc.get("model_name", "unknown")

            stats["requests_by_type"][request_type] += 1
            stats["requests_by_user"][user_id] += 1
            stats["requests_by_model"][model_name] += 1

            prompt_tokens = doc.get("prompt_tokens", 0)
            completion_tokens = doc.get("completion_tokens", 0)
            total_tokens = prompt_tokens + completion_tokens
            stats["tokens_by_type"][request_type] += total_tokens
            stats["tokens_by_user"][user_id] += total_tokens
            stats["tokens_by_model"][model_name] += total_tokens
            stats["total_tokens"] += total_tokens

            cost = doc.get("cost", 0.0)
            stats["total_cost"] += cost
            stats["costs_by_user"][user_id] += cost
            stats["costs_by_type"][request_type] += cost
            stats["costs_by_model"][model_name] += cost

            total_requests += 1

        if total_requests > 0:
            stats["average_tokens"] = stats["total_tokens"] / total_requests

        # Online time
        online_time_cursor = db.online_time.find({"timestamp": {"$gte": start_time}})
        for doc in online_time_cursor:
            stats["online_time_minutes"] += doc.get("duration", 0)

        # Message counts
        messages_cursor = db.messages.find({"time": {"$gte": start_time.timestamp()}})
        for doc in messages_cursor:
            stats["total_messages"] += 1
            chat_info = doc.get("chat_info", {})
            user_info = doc.get("user_info", {})
            user_id = str(user_info.get("user_id", "unknown"))
            message_time = doc.get("time", 0)

            group_info = chat_info.get("group_info") if chat_info else {}
            group_id = "unknown"  # fallback when neither group nor user info is present
            group_name = None
            if group_info:
                group_id = f"g{group_info.get('group_id')}"
                group_name = group_info.get("group_name", f"{group_info.get('group_id')}")
            if user_info and not group_name:
                group_id = f"u{user_info['user_id']}"
                group_name = user_info["user_nickname"]

            # Remember the most recent display name seen for this chat; fall back to
            # the chat key when no display name is available.
            if self.name_dict.get(group_id):
                if message_time > self.name_dict[group_id][1]:
                    self.name_dict[group_id] = [group_name or group_id, message_time]
            else:
                self.name_dict[group_id] = [group_name or group_id, message_time]

            stats["messages_by_user"][user_id] += 1
            stats["messages_by_chat"][group_id] += 1

        return stats

    def _collect_all_statistics(self) -> Dict[str, Dict[str, Any]]:
        """Collect statistics for every reporting window."""
        now = datetime.now()
        # Use 2000-01-01 as the start of the "all time" window; it is a more sensible
        # starting point than an unbounded query.
        all_time_start = datetime(2000, 1, 1)
        return {
            "all_time": self._collect_statistics_for_period(all_time_start),
            "last_7_days": self._collect_statistics_for_period(now - timedelta(days=7)),
            "last_24_hours": self._collect_statistics_for_period(now - timedelta(days=1)),
            "last_hour": self._collect_statistics_for_period(now - timedelta(hours=1)),
        }
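
    # Each value above is one per-window stats dict consumed by the formatters below.
    # A single section can also be produced standalone (hypothetical usage, where
    # `statistics` is an LLMStatistics instance):
    #   period_stats = statistics._collect_statistics_for_period(datetime.now() - timedelta(days=1))
    #   print(statistics._format_stats_section(period_stats, "最近24小时统计"))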

    def _format_stats_section(self, stats: Dict[str, Any], title: str) -> str:
        """Format one statistics section of the report."""
        output = ["\n" + "-" * 84, f"{title}", "-" * 84, f"总请求数: {stats['total_requests']}"]
        if stats["total_requests"] > 0:
            output.append(f"总Token数: {stats['total_tokens']}")
            output.append(f"总花费: {stats['total_cost']:.4f}¥")
            output.append(f"在线时间: {stats['online_time_minutes']}分钟")
            output.append(f"总消息数: {stats['total_messages']}\n")

            data_fmt = "{:<32} {:>10} {:>14} {:>13.4f} ¥"

            # Per-model breakdown
            output.append("按模型统计:")
            output.append("模型名称 调用次数 Token总量 累计花费")
            for model_name, count in sorted(stats["requests_by_model"].items()):
                tokens = stats["tokens_by_model"][model_name]
                cost = stats["costs_by_model"][model_name]
                output.append(
                    data_fmt.format(model_name[:30] + ".." if len(model_name) > 32 else model_name, count, tokens, cost)
                )
            output.append("")

            # Per-request-type breakdown
            output.append("按请求类型统计:")
            output.append("请求类型 调用次数 Token总量 累计花费")
            for req_type, count in sorted(stats["requests_by_type"].items()):
                tokens = stats["tokens_by_type"][req_type]
                cost = stats["costs_by_type"][req_type]
                output.append(
                    data_fmt.format(req_type[:22] + ".." if len(req_type) > 24 else req_type, count, tokens, cost)
                )
            output.append("")

            # Per-user breakdown
            output.append("按用户统计:")
            output.append("用户ID 调用次数 Token总量 累计花费")
            for user_id, count in sorted(stats["requests_by_user"].items()):
                tokens = stats["tokens_by_user"][user_id]
                cost = stats["costs_by_user"][user_id]
                output.append(
                    data_fmt.format(
                        user_id[:22],  # keep the raw ID, no ellipsis appended
                        count,
                        tokens,
                        cost,
                    )
                )
            output.append("")

            # Per-chat message counts
            output.append("群组统计:")
            output.append("群组名称 消息数量")
            for group_id, count in sorted(stats["messages_by_chat"].items()):
                output.append(f"{self.name_dict[group_id][0][:32]:<32} {count:>10}")

        return "\n".join(output)

    def _format_stats_section_lite(self, stats: Dict[str, Any], title: str) -> str:
        """Format a condensed statistics section (total cost, message count, per-model
        and per-chat breakdowns only)."""
        output = ["\n" + "-" * 84, f"{title}", "-" * 84]
        if stats["total_requests"] > 0:
            output.append(f"总花费: {stats['total_cost']:.4f}¥")
            output.append(f"总消息数: {stats['total_messages']}\n")

            data_fmt = "{:<32} {:>10} {:>14} {:>13.4f} ¥"

            # Per-model breakdown
            output.append("按模型统计:")
            output.append("模型名称 调用次数 Token总量 累计花费")
            for model_name, count in sorted(stats["requests_by_model"].items()):
                tokens = stats["tokens_by_model"][model_name]
                cost = stats["costs_by_model"][model_name]
                output.append(
                    data_fmt.format(model_name[:30] + ".." if len(model_name) > 32 else model_name, count, tokens, cost)
                )
            output.append("")

            # The per-request-type and per-user breakdowns are intentionally omitted
            # from this condensed view; see _format_stats_section for the full report.

            # Per-chat message counts
            output.append("群组统计:")
            output.append("群组名称 消息数量")
            for group_id, count in sorted(stats["messages_by_chat"].items()):
                output.append(f"{self.name_dict[group_id][0][:32]:<32} {count:>10}")

        return "\n".join(output)

    def _save_statistics(self, all_stats: Dict[str, Dict[str, Any]]):
        """Write the statistics report to the output file."""
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        output = [f"LLM请求统计报告 (生成时间: {current_time})"]

        # One section per reporting window
        sections = [
            ("所有时间统计", "all_time"),
            ("最近7天统计", "last_7_days"),
            ("最近24小时统计", "last_24_hours"),
            ("最近1小时统计", "last_hour"),
        ]
        for title, key in sections:
            output.append(self._format_stats_section(all_stats[key], title))

        # Write to file
        with open(self.output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(output))

    def _console_output_loop(self):
        """Console output loop: every 5 minutes, log the statistics for the last hour."""
        while self.running:
            # Wait 5 minutes, checking the stop flag once per second
            for _ in range(300):  # 5 minutes = 300 seconds
                if not self.running:
                    break
                time.sleep(1)
            try:
                # Collect statistics for the last hour
                now = datetime.now()
                hour_stats = self._collect_statistics_for_period(now - timedelta(hours=1))
                # Log the condensed section
                stats_output = self._format_stats_section_lite(
                    hour_stats, "最近1小时统计详细信息见根目录文件llm_statistics.txt"
                )
                logger.info("\n" + stats_output + "\n" + "=" * 50)
            except Exception:
                logger.exception("控制台统计数据输出失败")

    def _stats_loop(self):
        """Statistics loop: every 5 minutes, record online time and refresh the report file."""
        while self.running:
            try:
                # Record online time
                self._record_online_time()
                # Collect and save statistics
                all_stats = self._collect_all_statistics()
                self._save_statistics(all_stats)
            except Exception:
                logger.exception("统计数据处理失败")
            # Wait 5 minutes, checking the stop flag once per second
            for _ in range(300):  # 5 minutes = 300 seconds
                if not self.running:
                    break
                time.sleep(1)
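

# Minimal usage sketch (assumes this module is imported somewhere the src.common and
# relative database imports above resolve; the surrounding application may wire this
# up differently):
#
#   statistics = LLMStatistics("llm_statistics.txt")
#   statistics.start()   # spawns the report-file and console daemon threads
#   ...                  # run the application
#   statistics.stop()    # flag the loops to exit and join both threads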