feat(exception-handling): 增强全局异常处理,添加线程和 asyncio 异常捕获

This commit is contained in:
Windpicker-owo
2025-12-04 18:34:37 +08:00
parent 4e0012dba9
commit cc91e1a2a5
3 changed files with 210 additions and 19 deletions

182
bot.py
View File

@@ -1,12 +1,17 @@
import asyncio import asyncio
import faulthandler
import os import os
import platform import platform
import sys import sys
import threading
import time import time
import traceback import traceback
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from pathlib import Path from pathlib import Path
# 启用 faulthandler 以便在段错误等致命错误时打印 traceback
faulthandler.enable()
# 初始化基础工具 # 初始化基础工具
from colorama import Fore, init from colorama import Fore, init
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -20,6 +25,88 @@ initialize_logging()
logger = get_logger("main") logger = get_logger("main")
install(extra_lines=3) install(extra_lines=3)
# ============= 全局异常捕获系统 =============
def _global_exception_handler(exc_type, exc_value, exc_tb):
"""全局异常处理器 - 捕获主线程中未处理的异常"""
if issubclass(exc_type, KeyboardInterrupt):
# 对 Ctrl+C 使用默认处理
sys.__excepthook__(exc_type, exc_value, exc_tb)
return
# 格式化异常信息
error_msg = "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
# 尝试通过日志系统记录
try:
logger.critical(f"未捕获的致命异常:\n{error_msg}")
except Exception:
pass
# 同时直接输出到 stderr确保即使日志系统失败也能看到错误
print(f"\n{'='*60}", file=sys.stderr)
print("致命错误 - 未捕获的异常:", file=sys.stderr)
print(f"{'='*60}", file=sys.stderr)
print(error_msg, file=sys.stderr)
print(f"{'='*60}\n", file=sys.stderr)
sys.stderr.flush()
def _thread_exception_handler(args):
"""线程异常处理器 - 捕获子线程中未处理的异常"""
exc_type = args.exc_type
exc_value = args.exc_value
exc_tb = args.exc_traceback
thread = args.thread
if issubclass(exc_type, SystemExit):
return
thread_name = thread.name if thread else "Unknown"
error_msg = "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
# 尝试通过日志系统记录
try:
logger.critical(f"线程 '{thread_name}' 中发生未捕获的异常:\n{error_msg}")
except Exception:
pass
# 同时直接输出到 stderr
print(f"\n{'='*60}", file=sys.stderr)
print(f"致命错误 - 线程 '{thread_name}' 中未捕获的异常:", file=sys.stderr)
print(f"{'='*60}", file=sys.stderr)
print(error_msg, file=sys.stderr)
print(f"{'='*60}\n", file=sys.stderr)
sys.stderr.flush()
def _unraisable_exception_handler(unraisable):
"""不可抛出异常处理器 - 捕获 __del__ 等中的异常"""
exc_type = unraisable.exc_type
exc_value = unraisable.exc_value
exc_tb = unraisable.exc_traceback
obj = unraisable.object
error_msg = "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
obj_repr = repr(obj) if obj else "N/A"
# 尝试通过日志系统记录
try:
logger.error(f"不可抛出的异常 (对象: {obj_repr}):\n{error_msg}")
except Exception:
pass
# 输出到 stderr
print(f"\n警告 - 不可抛出的异常 (对象: {obj_repr}):", file=sys.stderr)
print(error_msg, file=sys.stderr)
sys.stderr.flush()
# 安装全局异常处理器
sys.excepthook = _global_exception_handler
threading.excepthook = _thread_exception_handler
sys.unraisablehook = _unraisable_exception_handler
# 常量定义 # 常量定义
SUPPORTED_DATABASES = ["sqlite", "mysql", "postgresql"] SUPPORTED_DATABASES = ["sqlite", "mysql", "postgresql"]
SHUTDOWN_TIMEOUT = 10.0 SHUTDOWN_TIMEOUT = 10.0
@@ -260,6 +347,31 @@ async def create_event_loop_context():
try: try:
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
# 设置 asyncio 异常处理器
def asyncio_exception_handler(loop, context):
"""asyncio 异常处理器 - 捕获事件循环中未处理的异常"""
exception = context.get("exception")
message = context.get("message", "")
if exception:
error_msg = "".join(traceback.format_exception(type(exception), exception, exception.__traceback__))
try:
logger.error(f"asyncio 未处理异常: {message}\n{error_msg}")
except Exception:
pass
print(f"\nasyncio 未处理异常: {message}", file=sys.stderr)
print(error_msg, file=sys.stderr)
sys.stderr.flush()
else:
try:
logger.error(f"asyncio 错误: {message}")
except Exception:
pass
print(f"\nasyncio 错误: {message}", file=sys.stderr)
sys.stderr.flush()
loop.set_exception_handler(asyncio_exception_handler)
yield loop yield loop
except Exception as e: except Exception as e:
logger.error(f"创建事件循环失败: {e}") logger.error(f"创建事件循环失败: {e}")
@@ -267,9 +379,13 @@ async def create_event_loop_context():
finally: finally:
if loop and not loop.is_closed(): if loop and not loop.is_closed():
try: try:
await ShutdownManager.graceful_shutdown(loop) # 执行优雅关闭
# 注意:在 finally 中不能直接 await需要使用 loop.run_until_complete
shutdown_coro = ShutdownManager.graceful_shutdown(loop)
loop.run_until_complete(shutdown_coro)
except Exception as e: except Exception as e:
logger.error(f"关闭事件循环时出错: {e}") logger.error(f"关闭事件循环时出错: {e}")
print(f"关闭事件循环时出错: {e}", file=sys.stderr)
finally: finally:
try: try:
loop.close() loop.close()
@@ -627,11 +743,18 @@ async def main_async():
exit_code = 0 exit_code = 0
main_task = None main_task = None
# 在进入事件循环上下文之前先进行基本检查
try:
ConfigManager.ensure_env_file()
except Exception as e:
logger.critical(f"环境文件检查失败: {e}")
print(f"致命错误 - 环境文件检查失败: {e}", file=sys.stderr)
traceback.print_exc(file=sys.stderr)
sys.stderr.flush()
return 1
async with create_event_loop_context(): async with create_event_loop_context():
try: try:
# 确保环境文件存在
ConfigManager.ensure_env_file()
# 启动 WebUI 开发服务器(成功/失败都继续后续步骤) # 启动 WebUI 开发服务器(成功/失败都继续后续步骤)
webui_ok = await WebUIManager.start_dev_server(timeout=60) webui_ok = await WebUIManager.start_dev_server(timeout=60)
if webui_ok: if webui_ok:
@@ -654,6 +777,14 @@ async def main_async():
# 使用wait等待任意一个任务完成 # 使用wait等待任意一个任务完成
done, _pending = await asyncio.wait([main_task, user_input_done], return_when=asyncio.FIRST_COMPLETED) done, _pending = await asyncio.wait([main_task, user_input_done], return_when=asyncio.FIRST_COMPLETED)
# 检查已完成的任务是否有异常
for task in done:
exc = task.exception()
if exc is not None:
error_msg = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
logger.error(f"任务异常退出:\n{error_msg}")
exit_code = 1
# 如果用户输入任务完成用户按了Ctrl+C取消主任务 # 如果用户输入任务完成用户按了Ctrl+C取消主任务
if user_input_done in done and main_task not in done: if user_input_done in done and main_task not in done:
logger.info("用户请求退出,正在取消主任务...") logger.info("用户请求退出,正在取消主任务...")
@@ -669,9 +800,14 @@ async def main_async():
logger.warning("收到中断信号,正在优雅关闭...") logger.warning("收到中断信号,正在优雅关闭...")
if main_task and not main_task.done(): if main_task and not main_task.done():
main_task.cancel() main_task.cancel()
except asyncio.CancelledError:
logger.info("主异步任务被取消")
except Exception as e: except Exception as e:
logger.error(f"主程序发生异常: {e}") error_msg = traceback.format_exc()
logger.debug(f"异常详情: {traceback.format_exc()}") logger.critical(f"主程序发生异常:\n{error_msg}")
# 同时输出到 stderr 确保可见
print(f"\n主程序异常:\n{error_msg}", file=sys.stderr)
sys.stderr.flush()
exit_code = 1 exit_code = 1
return exit_code return exit_code
@@ -684,14 +820,40 @@ if __name__ == "__main__":
except KeyboardInterrupt: except KeyboardInterrupt:
logger.info("程序被用户中断") logger.info("程序被用户中断")
exit_code = 130 exit_code = 130
except Exception as e: except SystemExit as e:
logger.error(f"程序启动失败: {e}") # 保留 SystemExit 的退出码
exit_code = e.code if isinstance(e.code, int) else 1
if exit_code != 0:
logger.warning(f"程序通过 SystemExit 退出,退出码: {exit_code}")
except BaseException as e:
# 捕获所有异常,包括 Exception 和其他 BaseException 子类
error_msg = traceback.format_exc()
try:
logger.critical(f"程序启动失败 - 致命异常:\n{error_msg}")
except Exception:
pass
# 确保错误被输出到 stderr
print(f"\n{'='*60}", file=sys.stderr)
print("致命错误 - 程序启动失败:", file=sys.stderr)
print(f"{'='*60}", file=sys.stderr)
print(error_msg, file=sys.stderr)
print(f"{'='*60}\n", file=sys.stderr)
sys.stderr.flush()
exit_code = 1 exit_code = 1
finally: finally:
# 确保日志系统正确关闭 # 确保日志系统正确关闭,并给日志队列时间刷新
try: try:
# 给日志队列一点时间来刷新
time.sleep(0.2)
shutdown_logging() shutdown_logging()
except Exception as e: except Exception as e:
print(f"关闭日志系统时出错: {e}") print(f"关闭日志系统时出错: {e}", file=sys.stderr)
# 最后刷新标准输出和错误输出
try:
sys.stdout.flush()
sys.stderr.flush()
except Exception:
pass
sys.exit(exit_code) sys.exit(exit_code)

View File

@@ -1308,11 +1308,24 @@ def shutdown_logging():
logger = get_logger("logger") logger = get_logger("logger")
logger.info("正在关闭日志系统...") logger.info("正在关闭日志系统...")
# 给日志队列一点时间来刷新
import time
time.sleep(0.1)
# 停止队列监听器(这会等待队列清空)
_stop_queue_logging()
# 关闭所有handler # 关闭所有handler
root_logger = logging.getLogger() root_logger = logging.getLogger()
for handler in root_logger.handlers[:]: for handler in root_logger.handlers[:]:
try:
# 先刷新 handler
if hasattr(handler, "flush"):
handler.flush()
if hasattr(handler, "close"): if hasattr(handler, "close"):
handler.close() handler.close()
except Exception as e:
print(f"[日志系统] 关闭 handler 时出错: {e}")
root_logger.removeHandler(handler) root_logger.removeHandler(handler)
# 关闭全局handler # 关闭全局handler
@@ -1323,8 +1336,14 @@ def shutdown_logging():
for logger_obj in logger_dict.values(): for logger_obj in logger_dict.values():
if isinstance(logger_obj, logging.Logger): if isinstance(logger_obj, logging.Logger):
for handler in logger_obj.handlers[:]: for handler in logger_obj.handlers[:]:
try:
if hasattr(handler, "flush"):
handler.flush()
if hasattr(handler, "close"): if hasattr(handler, "close"):
handler.close() handler.close()
except Exception as e:
print(f"[日志系统] 关闭 logger handler 时出错: {e}")
logger_obj.removeHandler(handler) logger_obj.removeHandler(handler)
logger.info("日志系统已关闭") # 最终输出(直接到 stderr 因为日志系统已关闭
print("[日志系统] 日志系统已关闭")

View File

@@ -678,9 +678,16 @@ async def main() -> None:
await system.schedule_tasks() await system.schedule_tasks()
except KeyboardInterrupt: except KeyboardInterrupt:
logger.info("收到键盘中断信号") logger.info("收到键盘中断信号")
except asyncio.CancelledError:
logger.info("主任务被取消")
except Exception as e: except Exception as e:
logger.error(f"主函数执行失败: {e}") error_msg = traceback.format_exc()
logger.error(traceback.format_exc()) logger.critical(f"主函数执行失败:\n{error_msg}")
# 同时输出到 stderr 确保即使日志系统异常也能看到
import sys
print(f"\n主函数致命错误:\n{error_msg}", file=sys.stderr)
sys.stderr.flush()
raise # 重新抛出以便上层处理
finally: finally:
await system.shutdown() await system.shutdown()
@@ -691,6 +698,9 @@ if __name__ == "__main__":
except KeyboardInterrupt: except KeyboardInterrupt:
logger.info("程序被用户中断") logger.info("程序被用户中断")
except Exception as e: except Exception as e:
logger.error(f"程序执行失败: {e}") error_msg = traceback.format_exc()
logger.error(traceback.format_exc()) logger.critical(f"程序执行失败:\n{error_msg}")
import sys
print(f"\n程序致命错误:\n{error_msg}", file=sys.stderr)
sys.stderr.flush()
sys.exit(1) sys.exit(1)