Files
Mofox-Core/src/common/database/utils/monitoring.py
LuiKlee d6ba543b24 实现慢查询监控系统
该功能默认关闭
2025-12-17 14:09:02 +08:00

516 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""数据库性能监控
提供数据库操作的性能监控和统计功能
"""
import time
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Optional
from src.common.logger import get_logger
logger = get_logger("database.monitoring")
@dataclass
class SlowQueryRecord:
"""慢查询记录"""
operation_name: str
execution_time: float
timestamp: float
sql: str | None = None
args: tuple | None = None
stack_trace: str | None = None
def __str__(self) -> str:
return (
f"[{self.operation_name}] {self.execution_time:.3f}s "
f"@ {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.timestamp))}"
)
@dataclass
class OperationMetrics:
"""操作指标"""
count: int = 0
total_time: float = 0.0
min_time: float = float("inf")
max_time: float = 0.0
error_count: int = 0
last_execution_time: float | None = None
slow_query_count: int = 0 # 该操作的慢查询数
@property
def avg_time(self) -> float:
"""平均执行时间"""
return self.total_time / self.count if self.count > 0 else 0.0
def record_success(self, execution_time: float):
"""记录成功执行"""
self.count += 1
self.total_time += execution_time
self.min_time = min(self.min_time, execution_time)
self.max_time = max(self.max_time, execution_time)
self.last_execution_time = time.time()
def record_error(self):
"""记录错误"""
self.error_count += 1
def record_slow_query(self):
"""记录慢查询"""
self.slow_query_count += 1
@dataclass
class DatabaseMetrics:
    """Top-level container for all database performance counters."""

    # Per-operation timing/error stats, keyed by operation name.
    operations: dict[str, OperationMetrics] = field(default_factory=dict)
    # Connection-pool counters.
    connection_acquired: int = 0
    connection_released: int = 0
    connection_errors: int = 0
    # Cache counters.
    cache_hits: int = 0
    cache_misses: int = 0
    cache_sets: int = 0
    cache_invalidations: int = 0
    # Batch-operation counters.
    batch_operations: int = 0
    batch_items_total: int = 0
    batch_avg_size: float = 0.0
    # Slow-query counters.
    slow_query_count: int = 0
    slow_query_threshold: float = 0.5  # seconds; queries slower than this are "slow"

    @property
    def cache_hit_rate(self) -> float:
        """Fraction of cache lookups that hit (0.0 when there were none)."""
        lookups = self.cache_hits + self.cache_misses
        if lookups <= 0:
            return 0.0
        return self.cache_hits / lookups

    @property
    def error_rate(self) -> float:
        """Fraction of recorded operations that errored (0.0 when none recorded)."""
        total_ops = 0
        total_errors = 0
        for m in self.operations.values():
            total_ops += m.count
            total_errors += m.error_count
        if total_ops <= 0:
            return 0.0
        return total_errors / total_ops

    def get_operation_metrics(self, operation_name: str) -> OperationMetrics:
        """Return the metrics bucket for an operation, creating it on first use."""
        bucket = self.operations.get(operation_name)
        if bucket is None:
            bucket = OperationMetrics()
            self.operations[operation_name] = bucket
        return bucket
class DatabaseMonitor:
    """Database performance monitor (singleton).

    Aggregates per-operation timings, connection-pool / cache / batch
    counters, and keeps a bounded ring buffer of recent slow queries.
    Can emit a human-readable summary through the module logger.

    Slow-query capture is OFF by default; turn it on with ``enable()``
    or ``set_slow_query_config()``.
    """

    _instance: Optional["DatabaseMonitor"] = None
    _metrics: DatabaseMetrics
    _slow_queries: deque  # ring buffer of recent SlowQueryRecord entries
    _slow_query_buffer_size: int = 100
    _enabled: bool = False  # whether slow-query capture is active

    def __new__(cls):
        # Singleton: per-instance state is initialised exactly once,
        # on the first construction.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._metrics = DatabaseMetrics()
            cls._instance._slow_queries = deque(maxlen=cls._slow_query_buffer_size)
            cls._instance._enabled = False
        return cls._instance

    def enable(self):
        """Enable slow-query monitoring."""
        self._enabled = True
        logger.info("✅ 慢查询监控已启用")

    def disable(self):
        """Disable slow-query monitoring."""
        self._enabled = False
        logger.info("❌ 慢查询监控已禁用")

    def is_enabled(self) -> bool:
        """Return True when slow-query monitoring is active."""
        return self._enabled

    def set_slow_query_config(self, threshold: float, buffer_size: int):
        """Configure slow-query capture and enable it.

        Args:
            threshold: execution time (seconds) above which a query is slow.
            buffer_size: how many recent slow-query records to retain.
        """
        self._metrics.slow_query_threshold = threshold
        self._slow_query_buffer_size = buffer_size
        # FIX: re-create the ring buffer with the new capacity while keeping
        # the most recent existing records (a deque built from an iterable
        # keeps the tail when the iterable exceeds maxlen). Previously the
        # buffer was replaced empty, silently dropping captured records.
        self._slow_queries = deque(self._slow_queries, maxlen=buffer_size)
        # Supplying a configuration implies the feature should be on.
        self._enabled = True

    def record_operation(
        self,
        operation_name: str,
        execution_time: float,
        success: bool = True,
        sql: str | None = None,
    ):
        """Record one database operation.

        Args:
            operation_name: logical name of the operation.
            execution_time: wall-clock duration in seconds.
            success: False routes the call to the error counter instead.
            sql: optional SQL text, attached to a slow-query record if one
                is generated.
        """
        metrics = self._metrics.get_operation_metrics(operation_name)
        if success:
            metrics.record_success(execution_time)
            # Slow-query classification only happens while monitoring is on.
            if self._enabled and execution_time > self._metrics.slow_query_threshold:
                self.record_slow_query(operation_name, execution_time, sql)
        else:
            metrics.record_error()

    def record_slow_query(
        self,
        operation_name: str,
        execution_time: float,
        sql: str | None = None,
        args: tuple | None = None,
        stack_trace: str | None = None,
    ):
        """Record a slow query and emit an immediate warning log.

        NOTE: intentionally does not check ``_enabled`` — direct callers
        can force a record even when automatic capture is off.
        """
        self._metrics.slow_query_count += 1
        self._metrics.get_operation_metrics(operation_name).record_slow_query()
        record = SlowQueryRecord(
            operation_name=operation_name,
            execution_time=execution_time,
            timestamp=time.time(),
            sql=sql,
            args=args,
            stack_trace=stack_trace,
        )
        self._slow_queries.append(record)
        # Log right away so slow queries act as a real-time alert.
        logger.warning(f"🐢 慢查询: {record}")

    def record_connection_acquired(self):
        """Count one connection acquisition."""
        self._metrics.connection_acquired += 1

    def record_connection_released(self):
        """Count one connection release."""
        self._metrics.connection_released += 1

    def record_connection_error(self):
        """Count one connection error."""
        self._metrics.connection_errors += 1

    def record_cache_hit(self):
        """Count one cache hit."""
        self._metrics.cache_hits += 1

    def record_cache_miss(self):
        """Count one cache miss."""
        self._metrics.cache_misses += 1

    def record_cache_set(self):
        """Count one cache write."""
        self._metrics.cache_sets += 1

    def record_cache_invalidation(self):
        """Count one cache invalidation."""
        self._metrics.cache_invalidations += 1

    def record_batch_operation(self, batch_size: int):
        """Count one batch operation and update the running average batch size."""
        self._metrics.batch_operations += 1
        self._metrics.batch_items_total += batch_size
        self._metrics.batch_avg_size = (
            self._metrics.batch_items_total / self._metrics.batch_operations
        )

    def get_metrics(self) -> DatabaseMetrics:
        """Return the live metrics object (not a copy)."""
        return self._metrics

    def get_slow_queries(self, limit: int = 0) -> list[SlowQueryRecord]:
        """Return buffered slow-query records, oldest first.

        Args:
            limit: maximum number of (most recent) records; 0 means all.

        Returns:
            List of slow-query records.
        """
        records = list(self._slow_queries)
        if limit > 0:
            records = records[-limit:]
        return records

    def get_slow_query_report(self) -> dict[str, Any]:
        """Build a report dict: totals, top offending operations, recent queries."""
        slow_queries = list(self._slow_queries)
        if not slow_queries:
            return {
                "total": 0,
                "threshold": f"{self._metrics.slow_query_threshold:.3f}s",
                "top_operations": [],
                "recent_queries": [],
            }
        # Aggregate per operation name.
        operation_stats: dict[str, dict[str, Any]] = {}
        for record in slow_queries:
            if record.operation_name not in operation_stats:
                operation_stats[record.operation_name] = {
                    "count": 0,
                    "total_time": 0.0,
                    "max_time": 0.0,
                    "min_time": float("inf"),
                }
            stats = operation_stats[record.operation_name]
            stats["count"] += 1
            stats["total_time"] += record.execution_time
            stats["max_time"] = max(stats["max_time"], record.execution_time)
            stats["min_time"] = min(stats["min_time"], record.execution_time)
        # Rank operations by slow-query count, keep the top 10.
        top_operations = sorted(
            operation_stats.items(),
            key=lambda x: x[1]["count"],
            reverse=True,
        )[:10]
        return {
            "total": len(slow_queries),
            "threshold": f"{self._metrics.slow_query_threshold:.3f}s",
            "top_operations": [
                {
                    "operation": op_name,
                    "count": stats["count"],
                    "avg_time": f"{stats['total_time'] / stats['count']:.3f}s",
                    "max_time": f"{stats['max_time']:.3f}s",
                    "min_time": f"{stats['min_time']:.3f}s",
                }
                for op_name, stats in top_operations
            ],
            "recent_queries": [
                {
                    "operation": record.operation_name,
                    "time": f"{record.execution_time:.3f}s",
                    "timestamp": time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.localtime(record.timestamp),
                    ),
                }
                for record in slow_queries[-20:]
            ],
        }

    def get_summary(self) -> dict[str, Any]:
        """Return a formatted summary dict covering all counter groups."""
        metrics = self._metrics
        operation_summary = {}
        for op_name, op_metrics in metrics.operations.items():
            # NOTE(review): for an operation with only errors (count == 0),
            # min_time renders as "infs" — confirm whether consumers care.
            operation_summary[op_name] = {
                "count": op_metrics.count,
                "avg_time": f"{op_metrics.avg_time:.3f}s",
                "min_time": f"{op_metrics.min_time:.3f}s",
                "max_time": f"{op_metrics.max_time:.3f}s",
                "error_count": op_metrics.error_count,
                "slow_query_count": op_metrics.slow_query_count,
            }
        return {
            "operations": operation_summary,
            "connections": {
                "acquired": metrics.connection_acquired,
                "released": metrics.connection_released,
                "errors": metrics.connection_errors,
                "active": metrics.connection_acquired - metrics.connection_released,
            },
            "cache": {
                "hits": metrics.cache_hits,
                "misses": metrics.cache_misses,
                "sets": metrics.cache_sets,
                "invalidations": metrics.cache_invalidations,
                "hit_rate": f"{metrics.cache_hit_rate:.2%}",
            },
            "batch": {
                "operations": metrics.batch_operations,
                "total_items": metrics.batch_items_total,
                "avg_size": f"{metrics.batch_avg_size:.1f}",
            },
            "overall": {
                "error_rate": f"{metrics.error_rate:.2%}",
                "slow_query_count": metrics.slow_query_count,
                "slow_query_threshold": f"{metrics.slow_query_threshold:.3f}s",
            },
        }

    def print_summary(self):
        """Log the full performance summary via the module logger."""
        summary = self.get_summary()
        logger.info("=" * 60)
        logger.info("数据库性能统计")
        logger.info("=" * 60)
        # Per-operation statistics.
        if summary["operations"]:
            logger.info("\n操作统计:")
            for op_name, stats in summary["operations"].items():
                logger.info(
                    f" {op_name}: "
                    f"次数={stats['count']}, "
                    f"平均={stats['avg_time']}, "
                    f"最小={stats['min_time']}, "
                    f"最大={stats['max_time']}, "
                    f"错误={stats['error_count']}, "
                    f"慢查询={stats['slow_query_count']}"
                )
        # Connection-pool statistics.
        logger.info("\n连接池:")
        conn = summary["connections"]
        logger.info(
            f" 获取={conn['acquired']}, "
            f"释放={conn['released']}, "
            f"活跃={conn['active']}, "
            f"错误={conn['errors']}"
        )
        # Cache statistics.
        logger.info("\n缓存:")
        cache = summary["cache"]
        logger.info(
            f" 命中={cache['hits']}, "
            f"未命中={cache['misses']}, "
            f"设置={cache['sets']}, "
            f"失效={cache['invalidations']}, "
            f"命中率={cache['hit_rate']}"
        )
        # Batch statistics.
        logger.info("\n批处理:")
        batch = summary["batch"]
        logger.info(
            f" 操作={batch['operations']}, "
            f"总项目={batch['total_items']}, "
            f"平均大小={batch['avg_size']}"
        )
        # Overall statistics.
        logger.info("\n整体:")
        overall = summary["overall"]
        logger.info(f" 错误率={overall['error_rate']}")
        logger.info(f" 慢查询总数={overall['slow_query_count']}")
        logger.info(f" 慢查询阈值={overall['slow_query_threshold']}")
        # Slow-query report, only when there is something to report.
        if overall["slow_query_count"] > 0:
            logger.info("\n🐢 慢查询报告:")
            slow_report = self.get_slow_query_report()
            if slow_report["top_operations"]:
                logger.info(" 按操作排名Top 10:")
                for idx, op in enumerate(slow_report["top_operations"], 1):
                    logger.info(
                        f" {idx}. {op['operation']}: "
                        f"次数={op['count']}, "
                        f"平均={op['avg_time']}, "
                        f"最大={op['max_time']}"
                    )
        logger.info("=" * 60)

    def reset(self):
        """Reset all statistics, including the slow-query buffer."""
        self._metrics = DatabaseMetrics()
        # FIX: also drop buffered slow-query records so get_slow_queries()
        # and get_slow_query_report() stay consistent with the zeroed
        # counters. Previously records survived a reset while the counter
        # read 0.
        self._slow_queries.clear()
        logger.info("数据库监控统计已重置")
# Lazily created global monitor instance.
_monitor: DatabaseMonitor | None = None
def get_monitor() -> DatabaseMonitor:
    """Return the process-wide DatabaseMonitor, creating it on first use."""
    global _monitor
    if _monitor is None:
        _monitor = DatabaseMonitor()
    return _monitor
# 便捷函数
def record_operation(
    operation_name: str,
    execution_time: float,
    success: bool = True,
    sql: str | None = None,
):
    """Record one database operation on the global monitor.

    FIX (backward-compatible): the wrapper previously dropped the SQL text,
    so slow queries recorded through it never carried their statement.
    ``sql`` now forwards to DatabaseMonitor.record_operation, which attaches
    it to any slow-query record generated.
    """
    get_monitor().record_operation(operation_name, execution_time, success, sql=sql)
def record_slow_query(
    operation_name: str,
    execution_time: float,
    sql: str | None = None,
    args: tuple | None = None,
    stack_trace: str | None = None,
):
    """Record a slow query on the global monitor.

    FIX (backward-compatible): ``stack_trace`` is now forwarded —
    DatabaseMonitor.record_slow_query accepts it, but the wrapper
    previously had no way to pass it.
    """
    get_monitor().record_slow_query(
        operation_name, execution_time, sql, args, stack_trace
    )
def get_slow_queries(limit: int = 0) -> list[SlowQueryRecord]:
    """Return recent slow-query records from the global monitor (0 = all)."""
    monitor = get_monitor()
    return monitor.get_slow_queries(limit)
def get_slow_query_report() -> dict[str, Any]:
    """Return the global monitor's slow-query report."""
    monitor = get_monitor()
    return monitor.get_slow_query_report()
def set_slow_query_config(threshold: float, buffer_size: int):
    """Configure (and implicitly enable) slow-query capture on the global monitor."""
    monitor = get_monitor()
    monitor.set_slow_query_config(threshold, buffer_size)
def enable_slow_query_monitoring():
    """Turn on slow-query monitoring on the global monitor."""
    monitor = get_monitor()
    monitor.enable()
def disable_slow_query_monitoring():
    """Turn off slow-query monitoring on the global monitor."""
    monitor = get_monitor()
    monitor.disable()
def is_slow_query_monitoring_enabled() -> bool:
    """Return True when the global monitor has slow-query capture active."""
    monitor = get_monitor()
    return monitor.is_enabled()
def record_cache_hit():
    """Count one cache hit on the global monitor."""
    monitor = get_monitor()
    monitor.record_cache_hit()
def record_cache_miss():
    """Count one cache miss on the global monitor."""
    monitor = get_monitor()
    monitor.record_cache_miss()
def print_stats():
    """Log the global monitor's performance summary."""
    monitor = get_monitor()
    monitor.print_summary()
def reset_stats():
    """Reset all statistics on the global monitor."""
    monitor = get_monitor()
    monitor.reset()