实现慢查询监控系统

该功能默认关闭
This commit is contained in:
LuiKlee
2025-12-17 14:09:02 +08:00
parent 410614cf62
commit d6ba543b24
8 changed files with 1197 additions and 9 deletions

View File

@@ -4,6 +4,7 @@
"""
import time
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Optional
@@ -12,6 +13,24 @@ from src.common.logger import get_logger
logger = get_logger("database.monitoring")
@dataclass
class SlowQueryRecord:
"""慢查询记录"""
operation_name: str
execution_time: float
timestamp: float
sql: str | None = None
args: tuple | None = None
stack_trace: str | None = None
def __str__(self) -> str:
return (
f"[{self.operation_name}] {self.execution_time:.3f}s "
f"@ {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.timestamp))}"
)
@dataclass
class OperationMetrics:
"""操作指标"""
@@ -22,6 +41,7 @@ class OperationMetrics:
max_time: float = 0.0
error_count: int = 0
last_execution_time: float | None = None
slow_query_count: int = 0 # 该操作的慢查询数
@property
def avg_time(self) -> float:
@@ -40,6 +60,10 @@ class OperationMetrics:
"""记录错误"""
self.error_count += 1
def record_slow_query(self):
    """Count one more slow query against this operation."""
    self.slow_query_count = self.slow_query_count + 1
@dataclass
class DatabaseMetrics:
@@ -64,6 +88,10 @@ class DatabaseMetrics:
batch_items_total: int = 0
batch_avg_size: float = 0.0
# 慢查询统计
slow_query_count: int = 0
slow_query_threshold: float = 0.5 # 慢查询阈值
@property
def cache_hit_rate(self) -> float:
"""缓存命中率"""
@@ -92,26 +120,83 @@ class DatabaseMonitor:
_instance: Optional["DatabaseMonitor"] = None
_metrics: DatabaseMetrics
_slow_queries: deque # 最近的慢查询记录
_slow_query_buffer_size: int = 100
_enabled: bool = False # 慢查询监控是否启用
def __new__(cls):
    """Return the shared monitor, building it on the first call.

    The first construction creates the instance and gives it a fresh
    ``DatabaseMetrics``, an empty bounded slow-query buffer sized from the
    class default, and slow-query monitoring switched off. Every later call
    returns that same object untouched.
    """
    if cls._instance is not None:
        return cls._instance

    created = super().__new__(cls)
    # Publish the instance before populating it, as the original code did.
    cls._instance = created
    created._metrics = DatabaseMetrics()
    created._slow_queries = deque(maxlen=cls._slow_query_buffer_size)
    created._enabled = False
    return cls._instance
def enable(self):
    """Switch slow-query monitoring on and log the change."""
    self._enabled = True
    notice = "✅ 慢查询监控已启用"
    logger.info(notice)
def disable(self):
    """Switch slow-query monitoring off and log the change."""
    self._enabled = False
    notice = "❌ 慢查询监控已禁用"
    logger.info(notice)
def is_enabled(self) -> bool:
    """Report whether slow-query monitoring is currently switched on."""
    current_state = self._enabled
    return current_state
def set_slow_query_config(self, threshold: float, buffer_size: int):
    """Apply a new slow-query threshold and buffer capacity.

    Records already captured are carried over into the resized buffer; when
    the new capacity is smaller, only the most recent ones are kept.
    Calling this method also switches slow-query monitoring on.

    Args:
        threshold: Execution time above which a successful operation is
            treated as a slow query.
        buffer_size: Maximum number of slow-query records to retain.

    Raises:
        ValueError: If ``buffer_size`` is negative. The existing
            configuration is left untouched in that case.
    """
    # Build the new buffer first: deque() rejects a negative maxlen, and
    # failing here keeps the threshold/size/buffer from being half-updated.
    resized = deque(self._slow_queries, maxlen=buffer_size)

    self._metrics.slow_query_threshold = threshold
    self._slow_query_buffer_size = buffer_size
    self._slow_queries = resized
    # Configuring the monitor implicitly enables it.
    self._enabled = True
def record_operation(
self,
operation_name: str,
execution_time: float,
success: bool = True,
sql: str | None = None,
):
"""记录操作"""
metrics = self._metrics.get_operation_metrics(operation_name)
if success:
metrics.record_success(execution_time)
# 只在启用时检查是否为慢查询
if self._enabled and execution_time > self._metrics.slow_query_threshold:
self.record_slow_query(operation_name, execution_time, sql)
else:
metrics.record_error()
def record_slow_query(
    self,
    operation_name: str,
    execution_time: float,
    sql: str | None = None,
    args: tuple | None = None,
    stack_trace: str | None = None,
):
    """Register a slow query, buffer it and log a warning.

    Increments both the global and the per-operation slow-query counters,
    appends a ``SlowQueryRecord`` stamped with the current time to the
    slow-query buffer, and emits a warning log line straight away. This
    method does not check whether monitoring is enabled.

    Args:
        operation_name: Name of the slow operation.
        execution_time: Measured duration of the operation.
        sql: Optional SQL text.
        args: Optional tuple stored on the record as ``args``.
        stack_trace: Optional stack trace text.
    """
    self._metrics.slow_query_count += 1
    per_operation = self._metrics.get_operation_metrics(operation_name)
    per_operation.record_slow_query()

    entry = SlowQueryRecord(
        operation_name,
        execution_time,
        time.time(),
        sql,
        args,
        stack_trace,
    )
    self._slow_queries.append(entry)

    # Log immediately so slow queries surface in real time.
    logger.warning(f"🐢 慢查询: {entry}")
def record_connection_acquired(self):
"""记录连接获取"""
self._metrics.connection_acquired += 1
@@ -152,6 +237,81 @@ class DatabaseMonitor:
"""获取指标"""
return self._metrics
def get_slow_queries(self, limit: int = 0) -> list[SlowQueryRecord]:
"""获取慢查询记录
Args:
limit: 返回数量限制0 表示返回全部
Returns:
慢查询记录列表
"""
records = list(self._slow_queries)
if limit > 0:
records = records[-limit:]
return records
def get_slow_query_report(self) -> dict[str, Any]:
    """Build a summary report from the buffered slow-query records.

    Returns:
        A dict with four keys:

        * ``total``: number of buffered records.
        * ``threshold``: the configured threshold formatted as ``"x.xxxs"``.
        * ``top_operations``: up to 10 operations ordered by how many
          buffered records they have (most first), each with its count and
          formatted average / maximum / minimum time.
        * ``recent_queries``: the last 20 buffered records, each with its
          operation name, formatted time and local timestamp.

        With an empty buffer, ``total`` is 0 and both lists are empty.
    """
    snapshot = list(self._slow_queries)
    threshold_text = f"{self._metrics.slow_query_threshold:.3f}s"

    if not snapshot:
        return {
            "total": 0,
            "threshold": threshold_text,
            "top_operations": [],
            "recent_queries": [],
        }

    # Aggregate the buffered records per operation name.
    per_operation: dict[str, dict[str, Any]] = {}
    for entry in snapshot:
        bucket = per_operation.setdefault(
            entry.operation_name,
            {
                "count": 0,
                "total_time": 0.0,
                "max_time": 0.0,
                "min_time": float("inf"),
            },
        )
        elapsed = entry.execution_time
        bucket["count"] += 1
        bucket["total_time"] += elapsed
        bucket["max_time"] = max(bucket["max_time"], elapsed)
        bucket["min_time"] = min(bucket["min_time"], elapsed)

    # Rank by number of slow queries; ties keep first-seen order.
    ranked = sorted(
        per_operation.items(),
        key=lambda pair: pair[1]["count"],
        reverse=True,
    )

    top_rows = []
    for name, bucket in ranked[:10]:
        average = bucket["total_time"] / bucket["count"]
        top_rows.append(
            {
                "operation": name,
                "count": bucket["count"],
                "avg_time": f"{average:.3f}s",
                "max_time": f"{bucket['max_time']:.3f}s",
                "min_time": f"{bucket['min_time']:.3f}s",
            }
        )

    recent_rows = []
    for entry in snapshot[-20:]:
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(entry.timestamp))
        recent_rows.append(
            {
                "operation": entry.operation_name,
                "time": f"{entry.execution_time:.3f}s",
                "timestamp": stamp,
            }
        )

    return {
        "total": len(snapshot),
        "threshold": threshold_text,
        "top_operations": top_rows,
        "recent_queries": recent_rows,
    }
def get_summary(self) -> dict[str, Any]:
"""获取统计摘要"""
metrics = self._metrics
@@ -164,6 +324,7 @@ class DatabaseMonitor:
"min_time": f"{op_metrics.min_time:.3f}s",
"max_time": f"{op_metrics.max_time:.3f}s",
"error_count": op_metrics.error_count,
"slow_query_count": op_metrics.slow_query_count,
}
return {
@@ -188,6 +349,8 @@ class DatabaseMonitor:
},
"overall": {
"error_rate": f"{metrics.error_rate:.2%}",
"slow_query_count": metrics.slow_query_count,
"slow_query_threshold": f"{metrics.slow_query_threshold:.3f}s",
},
}
@@ -209,7 +372,8 @@ class DatabaseMonitor:
f"平均={stats['avg_time']}, "
f"最小={stats['min_time']}, "
f"最大={stats['max_time']}, "
f"错误={stats['error_count']}"
f"错误={stats['error_count']}, "
f"慢查询={stats['slow_query_count']}"
)
# 连接池统计
@@ -246,6 +410,24 @@ class DatabaseMonitor:
logger.info("\n整体:")
overall = summary["overall"]
logger.info(f" 错误率={overall['error_rate']}")
logger.info(f" 慢查询总数={overall['slow_query_count']}")
logger.info(f" 慢查询阈值={overall['slow_query_threshold']}")
# 慢查询报告
if overall["slow_query_count"] > 0:
logger.info("\n🐢 慢查询报告:")
slow_report = self.get_slow_query_report()
if slow_report["top_operations"]:
logger.info(" 按操作排名Top 10:")
for idx, op in enumerate(slow_report["top_operations"], 1):
logger.info(
f" {idx}. {op['operation']}: "
f"次数={op['count']}, "
f"平均={op['avg_time']}, "
f"最大={op['max_time']}"
)
logger.info("=" * 60)
@@ -273,6 +455,46 @@ def record_operation(operation_name: str, execution_time: float, success: bool =
get_monitor().record_operation(operation_name, execution_time, success)
def record_slow_query(
    operation_name: str,
    execution_time: float,
    sql: str | None = None,
    args: tuple | None = None,
    stack_trace: str | None = None,
):
    """Record a slow query on the shared monitor.

    Thin convenience wrapper around the monitor's ``record_slow_query``.

    Args:
        operation_name: Name of the slow operation.
        execution_time: Measured duration of the operation.
        sql: Optional SQL text.
        args: Optional tuple stored on the record as ``args``.
        stack_trace: Optional stack trace text. Accepted here so callers of
            the module-level helper can supply everything the monitor
            method supports.
    """
    get_monitor().record_slow_query(
        operation_name,
        execution_time,
        sql,
        args,
        stack_trace,
    )
def get_slow_queries(limit: int = 0) -> list[SlowQueryRecord]:
    """Return slow-query records from the shared monitor.

    ``limit`` is forwarded unchanged to the monitor's ``get_slow_queries``.
    """
    monitor = get_monitor()
    return monitor.get_slow_queries(limit)
def get_slow_query_report() -> dict[str, Any]:
    """Return the slow-query report produced by the shared monitor."""
    monitor = get_monitor()
    return monitor.get_slow_query_report()
def set_slow_query_config(threshold: float, buffer_size: int):
    """Forward a new threshold and buffer size to the shared monitor."""
    monitor = get_monitor()
    monitor.set_slow_query_config(threshold, buffer_size)
def enable_slow_query_monitoring():
    """Switch slow-query monitoring on for the shared monitor."""
    monitor = get_monitor()
    monitor.enable()
def disable_slow_query_monitoring():
    """Switch slow-query monitoring off for the shared monitor."""
    monitor = get_monitor()
    monitor.disable()
def is_slow_query_monitoring_enabled() -> bool:
    """Report whether the shared monitor has slow-query monitoring on."""
    monitor = get_monitor()
    return monitor.is_enabled()
def record_cache_hit():
"""记录缓存命中"""
get_monitor().record_cache_hit()