diff --git a/requirements.txt b/requirements.txt
index 052fa4b06..6c7edfc80 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -71,4 +71,7 @@ python-multipart
 aiofiles
 inkfox
 soundfile
-pedalboard
\ No newline at end of file
+pedalboard
+
+# For local speech-to-text functionality (stt_whisper_plugin)
+openai-whisper
\ No newline at end of file
diff --git a/src/chat/utils/utils_voice.py b/src/chat/utils/utils_voice.py
index eae96e5f3..483acefd2 100644
--- a/src/chat/utils/utils_voice.py
+++ b/src/chat/utils/utils_voice.py
@@ -14,6 +14,47 @@ async def get_voice_text(voice_base64: str) -> str:
     if not global_config.voice.enable_asr:
         logger.warning("语音识别未启用,无法处理语音消息")
         return "[语音]"
+
+    asr_provider = global_config.voice.asr_provider
+
+    # Route to local recognition when configured
+    if asr_provider == "local":
+        from src.plugin_system.apis import tool_api
+        import tempfile
+        import base64
+        import os
+
+        local_asr_tool = tool_api.get_tool_instance("local_asr")
+        if not local_asr_tool:
+            logger.error("ASR provider 设置为 'local' 但未找到 'local_asr' 工具,请检查 stt_whisper_plugin 是否已加载。")
+            return "[语音(本地识别工具未找到)]"
+
+        audio_path = None
+        try:
+            audio_data = base64.b64decode(voice_base64)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".amr") as tmp_audio_file:
+                tmp_audio_file.write(audio_data)
+                audio_path = tmp_audio_file.name
+
+            text = await local_asr_tool.execute(function_args={"audio_path": audio_path})
+            if "失败" in text or "出错" in text or "错误" in text:
+                logger.warning(f"本地语音识别失败: {text}")
+                return "[语音(本地识别失败)]"
+
+            logger.info(f"本地语音识别成功: {text}")
+            return f"[语音] {text}"
+
+        except Exception as e:
+            logger.error(f"本地语音转文字失败: {e!s}")
+            return "[语音(本地识别出错)]"
+        finally:
+            if audio_path and os.path.exists(audio_path):
+                try:
+                    os.remove(audio_path)
+                except Exception as e:
+                    logger.error(f"清理临时音频文件失败: {e}")
+
+    # Fall back to API-based recognition by default
     try:
         _llm = LLMRequest(model_set=model_config.model_task_config.voice, request_type="audio")
         text = await _llm.generate_response_for_voice(voice_base64)
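Aside for reviewers: a minimal sketch for exercising the new dispatch path above by hand. It assumes a fully initialized MoFox-Bot runtime (config loaded, plugin system started, `asr_provider = "local"`); `sample.amr` is a hypothetical test clip, not a file shipped with the repo.

```python
import asyncio
import base64

from src.chat.utils.utils_voice import get_voice_text

async def main() -> None:
    # Encode a voice clip the same way the messaging layer delivers it.
    with open("sample.amr", "rb") as f:
        voice_base64 = base64.b64encode(f.read()).decode()
    # Dispatches to the local_asr tool when asr_provider == "local",
    # otherwise falls through to the API-based recognizer.
    print(await get_voice_text(voice_base64))

asyncio.run(main())
```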
diff --git a/src/config/official_configs.py b/src/config/official_configs.py
index d7cc3852c..72e2b76ce 100644
--- a/src/config/official_configs.py
+++ b/src/config/official_configs.py
@@ -257,6 +257,7 @@ class VoiceConfig(ValidatedConfigBase):
     """语音识别配置类"""
 
     enable_asr: bool = Field(default=False, description="启用语音识别")
+    asr_provider: str = Field(default="api", description="语音识别提供商")
 
 
 class EmojiConfig(ValidatedConfigBase):
diff --git a/src/plugins/built_in/stt_whisper_plugin/__init__.py b/src/plugins/built_in/stt_whisper_plugin/__init__.py
new file mode 100644
index 000000000..bd7cc2259
--- /dev/null
+++ b/src/plugins/built_in/stt_whisper_plugin/__init__.py
@@ -0,0 +1,9 @@
+from src.plugin_system.base.plugin_metadata import PluginMetadata
+
+__plugin_meta__ = PluginMetadata(
+    name="Whisper本地语音识别",
+    description="通过OpenAI Whisper模型提供本地语音转文字的功能",
+    usage="在 bot_config.toml 中将 asr_provider 设置为 'local' 即可启用",
+    version="0.1.0",
+    author="Elysia",
+)
\ No newline at end of file
diff --git a/src/plugins/built_in/stt_whisper_plugin/plugin.py b/src/plugins/built_in/stt_whisper_plugin/plugin.py
new file mode 100644
index 000000000..fb5ea38a7
--- /dev/null
+++ b/src/plugins/built_in/stt_whisper_plugin/plugin.py
@@ -0,0 +1,115 @@
+import asyncio
+import os
+import tempfile
+from typing import Any
+from pathlib import Path
+import toml
+
+import whisper
+
+from src.common.logger import get_logger
+from src.plugin_system import BasePlugin, ComponentInfo, register_plugin
+from src.plugin_system.base.base_tool import BaseTool
+from src.plugin_system.base.component_types import ComponentType, ToolInfo
+
+logger = get_logger("stt_whisper_plugin")
+
+# Module-level cache so the model is only loaded once
+_whisper_model = None
+_is_loading = False
+
+class LocalASRTool(BaseTool):
+    """
+    Local speech recognition tool.
+    """
+    tool_name = "local_asr"
+    tool_description = "将本地音频文件路径转换为文字。"
+    tool_parameters = [
+        {"name": "audio_path", "type": "string", "description": "需要识别的音频文件路径", "required": True}
+    ]
+
+    @classmethod
+    async def load_model_once(cls, plugin_config: dict):
+        """
+        Class method that triggers a single model load when the plugin starts.
+        """
+        global _whisper_model, _is_loading
+        if _whisper_model is None and not _is_loading:
+            _is_loading = True
+            try:
+                model_size = plugin_config.get("whisper", {}).get("model_size", "tiny")
+                device = plugin_config.get("whisper", {}).get("device", "cpu")
+                logger.info(f"正在预加载 Whisper ASR 模型: {model_size} ({device})")
+
+                loop = asyncio.get_running_loop()
+                _whisper_model = await loop.run_in_executor(
+                    None, whisper.load_model, model_size, device
+                )
+                logger.info(f"Whisper ASR 模型 '{model_size}' 预加载成功!")
+            except Exception as e:
+                logger.error(f"预加载 Whisper ASR 模型失败: {e}")
+                _whisper_model = None
+            finally:
+                _is_loading = False
+
+    async def execute(self, function_args: dict) -> str:
+        audio_path = function_args.get("audio_path")
+        if not audio_path:
+            return "错误:缺少 audio_path 参数。"
+
+        global _whisper_model
+        # If a background load is still in progress, wait for it to finish
+        while _is_loading:
+            await asyncio.sleep(0.2)
+
+        if _whisper_model is None:
+            return "Whisper 模型加载失败,无法识别语音。"
+
+        try:
+            logger.info(f"开始使用 Whisper 识别音频: {audio_path}")
+            loop = asyncio.get_running_loop()
+            result = await loop.run_in_executor(
+                None, _whisper_model.transcribe, audio_path
+            )
+            text_result = result.get("text", "")
+            text = str(text_result).strip()
+            logger.info(f"音频识别成功: {text}")
+            return text
+        except Exception as e:
+            logger.error(f"使用 Whisper 识别音频失败: {e}")
+            return f"语音识别出错: {e}"
+
+@register_plugin
+class STTWhisperPlugin(BasePlugin):
+    plugin_name = "stt_whisper_plugin"
+    config_file_name = "config.toml"
+    python_dependencies = ["openai-whisper"]
+
+    async def on_plugin_loaded(self):
+        """
+        Hook invoked after the plugin is loaded; kicks off model preloading.
+        """
+        try:
+            from src.config.config import global_config
+            if global_config.voice.asr_provider == "local":
+                # Load in the background; keep a reference so the task is not garbage-collected
+                self._preload_task = asyncio.create_task(LocalASRTool.load_model_once(self.config or {}))
+        except Exception as e:
+            logger.error(f"触发 Whisper 模型预加载时出错: {e}")
+
+    def get_plugin_components(self) -> list[tuple[ComponentInfo, type]]:
+        """Dynamically register components based on the main config."""
+        try:
+            from src.config.config import global_config
+            if global_config.voice.asr_provider == "local":
+                logger.info("ASR provider is 'local', enabling local_asr tool.")
+                return [(ToolInfo(
+                    name=LocalASRTool.tool_name,
+                    description=LocalASRTool.tool_description,
+                    component_type=ComponentType.TOOL
+                ), LocalASRTool)]
+        except Exception as e:
+            logger.error(f"检查 ASR provider 配置时出错: {e}")
+
+        logger.debug("ASR provider is not 'local', whisper plugin's tool is disabled.")
+        return []
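For context, the tool above boils down to two openai-whisper calls; here is a standalone sketch of that dependency. Both calls block, which is why the plugin pushes them into `run_in_executor`; decoding goes through ffmpeg, so AMR support depends on the local ffmpeg build, and `sample.amr` is a hypothetical file.

```python
import whisper

# Download (on first use) and load a checkpoint; sizes go from "tiny" to "large".
model = whisper.load_model("tiny", device="cpu")

# transcribe() takes a file path, decodes it via ffmpeg, and returns a dict
# whose "text" field holds the full transcription.
result = model.transcribe("sample.amr")
print(result["text"].strip())
```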
diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml
index a67bdfbd7..3523912b8 100644
--- a/template/bot_config_template.toml
+++ b/template/bot_config_template.toml
@@ -1,5 +1,5 @@
 [inner]
-version = "7.4.8"
+version = "7.4.9"
 
 #----以下是给开发人员阅读的,如果你只是部署了MoFox-Bot,不需要阅读----
 #如果你想要修改配置文件,请递增version的值
@@ -317,6 +317,9 @@ allow_reset = true # 允许重置
 
 [voice]
 enable_asr = true # 是否启用语音识别,启用后MoFox-Bot可以识别语音消息,启用该功能需要配置语音识别模型[model.voice]
+# [ASR provider] Allowed values: "api", "local". Defaults to "api".
+# Note: "local" is CPU-heavy and may cause lag on low-spec servers. See the stt_whisper_plugin config for details.
+asr_provider = "api"
 
 [lpmm_knowledge] # lpmm知识库配置
 enable = false # 是否启用lpmm知识库
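The diff does not ship a config.toml for the plugin itself, but `load_model_once` reads a `[whisper]` table with `model_size` and `device` keys, falling back to `"tiny"`/`"cpu"`. A plausible config matching those reads (hypothetical; the plugin behaves identically if the file or keys are absent):

```toml
# Hypothetical src/plugins/built_in/stt_whisper_plugin/config.toml
[whisper]
model_size = "tiny"  # tiny / base / small / medium / large: bigger is slower but more accurate
device = "cpu"       # set to "cuda" if a GPU-enabled PyTorch build is installed
```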