This commit is contained in:
明天好像没什么
2025-11-07 21:01:45 +08:00
parent 80b040da2f
commit c8d7c09625
49 changed files with 854 additions and 872 deletions

View File

@@ -10,10 +10,10 @@
示例:
# 进程监控(启动 bot 并监控)
python scripts/memory_profiler.py --monitor --interval 10
# 对象分析(深度对象统计)
python scripts/memory_profiler.py --objects --interval 10 --output memory_data.txt
# 生成可视化图表
python scripts/memory_profiler.py --visualize --input memory_data.txt.jsonl --top 15
"""
@@ -22,7 +22,6 @@ import argparse
import asyncio
import gc
import json
import os
import subprocess
import sys
import threading
@@ -30,7 +29,6 @@ import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import psutil
@@ -56,29 +54,29 @@ async def monitor_bot_process(bot_process: subprocess.Popen, interval: int = 5):
if bot_process.pid is None:
print("❌ Bot 进程 PID 为空")
return
print(f"🔍 开始监控 Bot 内存PID: {bot_process.pid}")
print(f"监控间隔: {interval}")
print("按 Ctrl+C 停止监控和 Bot\n")
try:
process = psutil.Process(bot_process.pid)
except psutil.NoSuchProcess:
print("❌ 无法找到 Bot 进程")
return
history = []
iteration = 0
try:
while bot_process.poll() is None:
try:
mem_info = process.memory_info()
mem_percent = process.memory_percent()
children = process.children(recursive=True)
children_mem = sum(child.memory_info().rss for child in children)
info = {
"timestamp": time.strftime("%H:%M:%S"),
"rss_mb": mem_info.rss / 1024 / 1024,
@@ -87,24 +85,24 @@ async def monitor_bot_process(bot_process: subprocess.Popen, interval: int = 5):
"children_count": len(children),
"children_mem_mb": children_mem / 1024 / 1024,
}
history.append(info)
iteration += 1
print(f"{'=' * 80}")
print(f"检查点 #{iteration} - {info['timestamp']}")
print(f"Bot 进程 (PID: {bot_process.pid})")
print(f" RSS: {info['rss_mb']:.2f} MB")
print(f" VMS: {info['vms_mb']:.2f} MB")
print(f" 占比: {info['percent']:.2f}%")
if children:
print(f" 子进程: {info['children_count']}")
print(f" 子进程内存: {info['children_mem_mb']:.2f} MB")
total_mem = info['rss_mb'] + info['children_mem_mb']
total_mem = info["rss_mb"] + info["children_mem_mb"]
print(f" 总内存: {total_mem:.2f} MB")
print(f"\n 📋 子进程详情:")
print("\n 📋 子进程详情:")
for idx, child in enumerate(children, 1):
try:
child_mem = child.memory_info().rss / 1024 / 1024
@@ -116,30 +114,30 @@ async def monitor_bot_process(bot_process: subprocess.Popen, interval: int = 5):
print(f" 命令: {child_cmdline}")
except (psutil.NoSuchProcess, psutil.AccessDenied):
print(f" [{idx}] 无法访问进程信息")
if len(history) > 1:
prev = history[-2]
rss_diff = info['rss_mb'] - prev['rss_mb']
print(f"\n变化:")
rss_diff = info["rss_mb"] - prev["rss_mb"]
print("\n变化:")
print(f" RSS: {rss_diff:+.2f} MB")
if rss_diff > 10:
print(f" ⚠️ 内存增长较快!")
if info['rss_mb'] > 1000:
print(f" ⚠️ 内存使用超过 1GB")
print(" ⚠️ 内存增长较快!")
if info["rss_mb"] > 1000:
print(" ⚠️ 内存使用超过 1GB")
print(f"{'=' * 80}\n")
await asyncio.sleep(interval)
except psutil.NoSuchProcess:
print("\n❌ Bot 进程已结束")
break
except Exception as e:
print(f"\n❌ 监控出错: {e}")
break
except KeyboardInterrupt:
print("\n\n⚠️ 用户中断监控")
finally:
if history and bot_process.pid:
save_process_history(history, bot_process.pid)
@@ -149,25 +147,25 @@ def save_process_history(history: list, pid: int):
"""保存进程监控历史"""
output_dir = Path("data/memory_diagnostics")
output_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = output_dir / f"process_monitor_{timestamp}_pid{pid}.txt"
with open(output_file, "w", encoding="utf-8") as f:
f.write("Bot 进程内存监控历史记录\n")
f.write("=" * 80 + "\n\n")
f.write(f"Bot PID: {pid}\n\n")
for info in history:
f.write(f"时间: {info['timestamp']}\n")
f.write(f"RSS: {info['rss_mb']:.2f} MB\n")
f.write(f"VMS: {info['vms_mb']:.2f} MB\n")
f.write(f"占比: {info['percent']:.2f}%\n")
if info['children_count'] > 0:
if info["children_count"] > 0:
f.write(f"子进程: {info['children_count']}\n")
f.write(f"子进程内存: {info['children_mem_mb']:.2f} MB\n")
f.write("\n")
print(f"\n✅ 监控历史已保存到: {output_file}")
@@ -182,28 +180,28 @@ async def run_monitor_mode(interval: int):
print(" 3. 显示子进程详细信息")
print(" 4. 自动保存监控历史")
print("=" * 80 + "\n")
project_root = Path(__file__).parent.parent
bot_file = project_root / "bot.py"
if not bot_file.exists():
print(f"❌ 找不到 bot.py: {bot_file}")
return 1
# 检测虚拟环境
venv_python = project_root / ".venv" / "Scripts" / "python.exe"
if not venv_python.exists():
venv_python = project_root / ".venv" / "bin" / "python"
if venv_python.exists():
python_exe = str(venv_python)
print(f"🐍 使用虚拟环境: {venv_python}")
else:
python_exe = sys.executable
print(f"⚠️ 未找到虚拟环境,使用当前 Python: {python_exe}")
print(f"🤖 启动 Bot: {bot_file}")
bot_process = subprocess.Popen(
[python_exe, str(bot_file)],
cwd=str(project_root),
@@ -212,9 +210,9 @@ async def run_monitor_mode(interval: int):
text=True,
bufsize=1,
)
await asyncio.sleep(2)
if bot_process.poll() is not None:
print("❌ Bot 启动失败")
if bot_process.stdout:
@@ -222,9 +220,9 @@ async def run_monitor_mode(interval: int):
if output:
print(f"\nBot 输出:\n{output}")
return 1
print(f"✅ Bot 已启动 (PID: {bot_process.pid})\n")
# 启动输出读取线程
def read_bot_output():
if bot_process.stdout:
@@ -233,15 +231,15 @@ async def run_monitor_mode(interval: int):
print(f"[Bot] {line}", end="")
except Exception:
pass
output_thread = threading.Thread(target=read_bot_output, daemon=True)
output_thread.start()
try:
await monitor_bot_process(bot_process, interval)
except KeyboardInterrupt:
print("\n\n⚠️ 用户中断")
if bot_process.poll() is None:
print("\n正在停止 Bot...")
bot_process.terminate()
@@ -251,9 +249,9 @@ async def run_monitor_mode(interval: int):
print("⚠️ 强制终止 Bot...")
bot_process.kill()
bot_process.wait()
print("✅ Bot 已停止")
return 0
@@ -263,8 +261,8 @@ async def run_monitor_mode(interval: int):
class ObjectMemoryProfiler:
"""对象级内存分析器"""
def __init__(self, interval: int = 10, output_file: Optional[str] = None, object_limit: int = 20):
def __init__(self, interval: int = 10, output_file: str | None = None, object_limit: int = 20):
self.interval = interval
self.output_file = output_file
self.object_limit = object_limit
@@ -273,23 +271,23 @@ class ObjectMemoryProfiler:
if PYMPLER_AVAILABLE:
self.tracker = tracker.SummaryTracker()
self.iteration = 0
def get_object_stats(self) -> Dict:
def get_object_stats(self) -> dict:
"""获取当前进程的对象统计(所有线程)"""
if not PYMPLER_AVAILABLE:
return {}
try:
gc.collect()
all_objects = muppy.get_objects()
sum_data = summary.summarize(all_objects)
# 按总大小第3个元素降序排序
sorted_sum_data = sorted(sum_data, key=lambda x: x[2], reverse=True)
# 按模块统计内存
module_stats = self._get_module_stats(all_objects)
threads = threading.enumerate()
thread_info = [
{
@@ -299,13 +297,13 @@ class ObjectMemoryProfiler:
}
for t in threads
]
gc_stats = {
"collections": gc.get_count(),
"garbage": len(gc.garbage),
"tracked": len(gc.get_objects()),
}
return {
"summary": sorted_sum_data[:self.object_limit],
"module_stats": module_stats,
@@ -316,52 +314,52 @@ class ObjectMemoryProfiler:
except Exception as e:
print(f"❌ 获取对象统计失败: {e}")
return {}
def _get_module_stats(self, all_objects: list) -> dict:
    """Aggregate object counts and shallow sizes per top-level module.

    Each object is attributed to the first dotted component of
    ``type(obj).__module__`` (for example ``src.chat.xxx`` is counted under
    ``src``). Sizes come from ``sys.getsizeof``, which is shallow: memory
    held by objects that an object merely references is not included.

    Args:
        all_objects: Objects to classify.

    Returns:
        A dict with two keys:
        ``"top_modules"`` - at most 20 ``(module, object_count, total_bytes)``
        tuples ordered by ``total_bytes``, largest first;
        ``"total_modules"`` - number of distinct top-level modules seen.
    """
    module_mem = defaultdict(lambda: {"count": 0, "size": 0})

    for obj in all_objects:
        try:
            module_name = type(obj).__module__
            if module_name:
                # Keep only the top-level package name (src.chat.xxx -> src).
                top_module = module_name.split(".")[0]
                # Measure before touching the counters so that an object
                # whose size cannot be read contributes to neither of them.
                obj_size = sys.getsizeof(obj)
                bucket = module_mem[top_module]
                bucket["count"] += 1
                bucket["size"] += obj_size
        except Exception:
            # Best effort: objects that cannot be inspected are left out.
            continue

    # Flatten to tuples and order by accumulated size, largest first.
    sorted_modules = sorted(
        [(mod, stats["count"], stats["size"]) for mod, stats in module_mem.items()],
        key=lambda entry: entry[2],
        reverse=True,
    )

    return {
        "top_modules": sorted_modules[:20],  # only the 20 largest modules
        "total_modules": len(module_mem),
    }
def print_stats(self, stats: Dict, iteration: int):
def print_stats(self, stats: dict, iteration: int):
"""打印统计信息"""
print("\n" + "=" * 80)
print(f"🔍 对象级内存分析 #{iteration} - {time.strftime('%H:%M:%S')}")
print("=" * 80)
if "summary" in stats:
print(f"\n📦 对象统计 (前 {self.object_limit} 个类型):\n")
print(f"{'类型':<50} {'数量':>12} {'总大小':>15}")
print("-" * 80)
for obj_type, obj_count, obj_size in stats["summary"]:
if obj_size >= 1024 * 1024 * 1024:
size_str = f"{obj_size / 1024 / 1024 / 1024:.2f} GB"
@@ -371,14 +369,14 @@ class ObjectMemoryProfiler:
size_str = f"{obj_size / 1024:.2f} KB"
else:
size_str = f"{obj_size} B"
print(f"{obj_type:<50} {obj_count:>12,} {size_str:>15}")
if "module_stats" in stats and stats["module_stats"]:
print(f"\n📚 模块内存占用 (前 20 个模块):\n")
if stats.get("module_stats"):
print("\n📚 模块内存占用 (前 20 个模块):\n")
print(f"{'模块名':<40} {'对象数':>12} {'总内存':>15}")
print("-" * 80)
for module_name, obj_count, obj_size in stats["module_stats"]["top_modules"]:
if obj_size >= 1024 * 1024 * 1024:
size_str = f"{obj_size / 1024 / 1024 / 1024:.2f} GB"
@@ -388,46 +386,46 @@ class ObjectMemoryProfiler:
size_str = f"{obj_size / 1024:.2f} KB"
else:
size_str = f"{obj_size} B"
print(f"{module_name:<40} {obj_count:>12,} {size_str:>15}")
print(f"\n 总模块数: {stats['module_stats']['total_modules']}")
if "threads" in stats:
print(f"\n🧵 线程信息 ({len(stats['threads'])} 个):")
for idx, t in enumerate(stats["threads"], 1):
status = "" if t["alive"] else ""
daemon = "(守护)" if t["daemon"] else ""
print(f" [{idx}] {status} {t['name']} {daemon}")
if "gc_stats" in stats:
gc_stats = stats["gc_stats"]
print(f"\n🗑️ 垃圾回收:")
print("\n🗑️ 垃圾回收:")
print(f" 代 0: {gc_stats['collections'][0]:,}")
print(f" 代 1: {gc_stats['collections'][1]:,}")
print(f" 代 2: {gc_stats['collections'][2]:,}")
print(f" 追踪对象: {gc_stats['tracked']:,}")
if "total_objects" in stats:
print(f"\n📊 总对象数: {stats['total_objects']:,}")
print("=" * 80 + "\n")
def print_diff(self):
    """Print the tracker's object-change report framed by two ruler lines.

    Nothing is printed when ``PYMPLER_AVAILABLE`` is false or when
    ``self.tracker`` is falsy. Otherwise a heading and an 80-character
    dashed ruler are printed, ``self.tracker.print_diff()`` is called, and a
    closing ruler follows.
    """
    tracker_ready = PYMPLER_AVAILABLE and self.tracker
    if tracker_ready:
        ruler = "-" * 80
        print("\n📈 对象变化分析:")
        print(ruler)
        self.tracker.print_diff()
        print(ruler)
def save_to_file(self, stats: Dict):
def save_to_file(self, stats: dict):
"""保存统计信息到文件"""
if not self.output_file:
return
try:
# 保存文本
with open(self.output_file, "a", encoding="utf-8") as f:
@@ -435,91 +433,91 @@ class ObjectMemoryProfiler:
f.write(f"时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"迭代: #{self.iteration}\n")
f.write(f"{'=' * 80}\n\n")
if "summary" in stats:
f.write("对象统计:\n")
for obj_type, obj_count, obj_size in stats["summary"]:
f.write(f" {obj_type}: {obj_count:,} 个, {obj_size:,} 字节\n")
if "module_stats" in stats and stats["module_stats"]:
if stats.get("module_stats"):
f.write("\n模块统计 (前 20 个):\n")
for module_name, obj_count, obj_size in stats["module_stats"]["top_modules"]:
f.write(f" {module_name}: {obj_count:,} 个对象, {obj_size:,} 字节\n")
f.write(f"\n总对象数: {stats.get('total_objects', 0):,}\n")
f.write(f"线程数: {len(stats.get('threads', []))}\n")
# 保存 JSONL
jsonl_path = str(self.output_file) + ".jsonl"
record = {
"timestamp": time.strftime('%Y-%m-%d %H:%M:%S'),
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"iteration": self.iteration,
"total_objects": stats.get("total_objects", 0),
"threads": stats.get("threads", []),
"gc_stats": stats.get("gc_stats", {}),
"summary": [
{"type": t, "count": c, "size": s}
{"type": t, "count": c, "size": s}
for (t, c, s) in stats.get("summary", [])
],
"module_stats": stats.get("module_stats", {}),
}
with open(jsonl_path, "a", encoding="utf-8") as jf:
jf.write(json.dumps(record, ensure_ascii=False) + "\n")
if self.iteration == 1:
print(f"💾 数据保存到: {self.output_file}")
print(f"💾 结构化数据: {jsonl_path}")
except Exception as e:
print(f"⚠️ 保存文件失败: {e}")
def start_monitoring(self):
    """Start the background sampling loop in a daemon thread.

    Sets ``self.running`` to True and launches a daemon thread that, until
    ``self.running`` becomes false, repeatedly:

    1. increments ``self.iteration``;
    2. collects statistics with ``get_object_stats`` and prints them with
       ``print_stats``;
    3. on every third iteration, if ``self.tracker`` is truthy, calls
       ``print_diff``;
    4. if ``self.output_file`` is set, calls ``save_to_file``;
    5. sleeps for ``self.interval`` seconds.

    An exception raised while sampling is printed with its traceback and
    the loop carries on. The sleep runs after every iteration, including a
    failed one, so a persistent error cannot turn the loop into a busy spin.

    The method returns as soon as the thread has been started.
    """
    self.running = True

    def monitor_loop():
        print("🚀 对象分析器已启动")
        print(f" 监控间隔: {self.interval}")
        print(f" 对象类型限制: {self.object_limit}")
        print(f" 输出文件: {self.output_file or ''}")
        print()

        while self.running:
            try:
                self.iteration += 1
                stats = self.get_object_stats()
                self.print_stats(stats, self.iteration)

                if self.iteration % 3 == 0 and self.tracker:
                    self.print_diff()

                if self.output_file:
                    self.save_to_file(stats)
            except Exception as e:
                print(f"❌ 监控出错: {e}")
                import traceback

                traceback.print_exc()

            # Kept outside the try block on purpose: a failing iteration
            # must still wait before the next attempt.
            time.sleep(self.interval)

    monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
    monitor_thread.start()
    print("✓ 监控线程已启动\n")
def stop(self):
    """Request that monitoring end by setting ``self.running`` to False."""
    self.running = False
def run_objects_mode(interval: int, output: Optional[str], object_limit: int):
def run_objects_mode(interval: int, output: str | None, object_limit: int):
"""对象分析模式主函数"""
if not PYMPLER_AVAILABLE:
print("❌ pympler 未安装,无法使用对象分析模式")
print(" 安装: pip install pympler")
return 1
print("=" * 80)
print("🔬 对象分析模式")
print("=" * 80)
@@ -529,38 +527,38 @@ def run_objects_mode(interval: int, output: Optional[str], object_limit: int):
print(" 3. 显示对象变化diff")
print(" 4. 保存 JSONL 数据用于可视化")
print("=" * 80 + "\n")
# 添加项目根目录到 Python 路径
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
print(f"✓ 已添加项目根目录到 Python 路径: {project_root}\n")
profiler = ObjectMemoryProfiler(
interval=interval,
output_file=output,
object_limit=object_limit
)
profiler.start_monitoring()
print("🤖 正在启动 Bot...\n")
try:
import bot
if hasattr(bot, 'main_async'):
if hasattr(bot, "main_async"):
asyncio.run(bot.main_async())
elif hasattr(bot, 'main'):
elif hasattr(bot, "main"):
bot.main()
else:
print("⚠️ bot.py 未找到 main_async() 或 main() 函数")
print(" Bot 模块已导入,监控线程在后台运行")
print(" 按 Ctrl+C 停止\n")
while profiler.running:
time.sleep(1)
except KeyboardInterrupt:
print("\n\n⚠️ 用户中断")
except Exception as e:
@@ -569,7 +567,7 @@ def run_objects_mode(interval: int, output: Optional[str], object_limit: int):
traceback.print_exc()
finally:
profiler.stop()
return 0
@@ -577,10 +575,10 @@ def run_objects_mode(interval: int, output: Optional[str], object_limit: int):
# 可视化模式
# ============================================================================
def load_jsonl(path: Path) -> List[Dict]:
def load_jsonl(path: Path) -> list[dict]:
"""加载 JSONL 文件"""
snapshots = []
with open(path, "r", encoding="utf-8") as f:
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
@@ -592,7 +590,7 @@ def load_jsonl(path: Path) -> List[Dict]:
return snapshots
def aggregate_top_types(snapshots: List[Dict], top_n: int = 10):
def aggregate_top_types(snapshots: list[dict], top_n: int = 10):
"""聚合前 N 个对象类型的时间序列"""
type_max = defaultdict(int)
for snap in snapshots:
@@ -600,37 +598,37 @@ def aggregate_top_types(snapshots: List[Dict], top_n: int = 10):
t = item.get("type")
s = int(item.get("size", 0))
type_max[t] = max(type_max[t], s)
top_types = sorted(type_max.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
top_names = [t for t, _ in top_types]
times = []
series = {t: [] for t in top_names}
for snap in snapshots:
ts = snap.get("timestamp")
try:
times.append(datetime.strptime(ts, "%Y-%m-%d %H:%M:%S"))
except Exception:
times.append(None)
summary = {item.get("type"): int(item.get("size", 0))
summary = {item.get("type"): int(item.get("size", 0))
for item in snap.get("summary", [])}
for t in top_names:
series[t].append(summary.get(t, 0) / 1024.0 / 1024.0)
return times, series
def plot_series(times: List, series: Dict, output: Path, top_n: int):
def plot_series(times: list, series: dict, output: Path, top_n: int):
"""绘制时间序列图"""
plt.figure(figsize=(14, 8))
for name, values in series.items():
if all(v == 0 for v in values):
continue
plt.plot(times, values, marker="o", label=name, linewidth=2)
plt.xlabel("时间", fontsize=12)
plt.ylabel("内存 (MB)", fontsize=12)
plt.title(f"对象类型随时间的内存占用 (前 {top_n} 类型)", fontsize=14)
@@ -647,31 +645,31 @@ def run_visualize_mode(input_file: str, output_file: str, top: int):
print("❌ matplotlib 未安装,无法使用可视化模式")
print(" 安装: pip install matplotlib")
return 1
print("=" * 80)
print("📊 可视化模式")
print("=" * 80)
path = Path(input_file)
if not path.exists():
print(f"❌ 找不到输入文件: {path}")
return 1
print(f"📂 读取数据: {path}")
snaps = load_jsonl(path)
if not snaps:
print("❌ 未读取到任何快照数据")
return 1
print(f"✓ 读取 {len(snaps)} 个快照")
times, series = aggregate_top_types(snaps, top_n=top)
print(f"✓ 提取前 {top} 个对象类型")
output_path = Path(output_file)
plot_series(times, series, output_path, top)
return 0
@@ -693,10 +691,10 @@ def main():
使用示例:
# 进程监控(启动 bot 并监控)
python scripts/memory_profiler.py --monitor --interval 10
# 对象分析(深度对象统计)
python scripts/memory_profiler.py --objects --interval 10 --output memory_data.txt
# 生成可视化图表
python scripts/memory_profiler.py --visualize --input memory_data.txt.jsonl --top 15 --plot-output plot.png
@@ -705,26 +703,26 @@ def main():
- 可视化模式需要: pip install matplotlib
""",
)
# 模式选择
mode_group = parser.add_mutually_exclusive_group(required=True)
mode_group.add_argument("--monitor", "-m", action="store_true",
mode_group.add_argument("--monitor", "-m", action="store_true",
help="进程监控模式(外部监控 bot 进程)")
mode_group.add_argument("--objects", "-o", action="store_true",
mode_group.add_argument("--objects", "-o", action="store_true",
help="对象分析模式(内部统计所有对象)")
mode_group.add_argument("--visualize", "-v", action="store_true",
mode_group.add_argument("--visualize", "-v", action="store_true",
help="可视化模式(绘制 JSONL 数据)")
# 通用参数
parser.add_argument("--interval", "-i", type=int, default=10,
help="监控间隔(秒),默认 10")
# 对象分析参数
parser.add_argument("--output", type=str,
help="输出文件路径(对象分析模式)")
parser.add_argument("--object-limit", "-l", type=int, default=20,
help="对象类型显示数量,默认 20")
# 可视化参数
parser.add_argument("--input", type=str,
help="输入 JSONL 文件(可视化模式)")
@@ -732,24 +730,24 @@ def main():
help="展示前 N 个类型(可视化模式),默认 10")
parser.add_argument("--plot-output", type=str, default="memory_analysis_plot.png",
help="图表输出文件,默认 memory_analysis_plot.png")
args = parser.parse_args()
# 根据模式执行
if args.monitor:
return asyncio.run(run_monitor_mode(args.interval))
elif args.objects:
if not args.output:
print("⚠️ 建议使用 --output 指定输出文件以保存数据")
return run_objects_mode(args.interval, args.output, args.object_limit)
elif args.visualize:
if not args.input:
print("❌ 可视化模式需要 --input 参数指定 JSONL 文件")
return 1
return run_visualize_mode(args.input, args.plot_output, args.top)
return 0