feat(voice): 新增本地语音识别(ASR)提供商选项
新增 `voice.asr_provider` 配置项,允许用户在 "api" 和 "local" 之间选择语音识别服务。当设置为 "local" 时,系统将通过 `local_asr` 工具(由 `stt_whisper_plugin` 插件提供)调用本地 Whisper 模型进行语音转文字。这为用户提供了一个不依赖外部 API、注重隐私的备选方案。默认值仍为 "api",保持现有行为不变;同时添加 `openai-whisper` 作为新的依赖项以支持此功能。
This commit is contained in:
@@ -71,4 +71,7 @@ python-multipart
|
|||||||
aiofiles
|
aiofiles
|
||||||
inkfox
|
inkfox
|
||||||
soundfile
|
soundfile
|
||||||
pedalboard
|
pedalboard
|
||||||
|
|
||||||
|
# For local speech-to-text functionality (stt_whisper_plugin)
|
||||||
|
openai-whisper
|
||||||
@@ -14,6 +14,47 @@ async def get_voice_text(voice_base64: str) -> str:
|
|||||||
if not global_config.voice.enable_asr:
|
if not global_config.voice.enable_asr:
|
||||||
logger.warning("语音识别未启用,无法处理语音消息")
|
logger.warning("语音识别未启用,无法处理语音消息")
|
||||||
return "[语音]"
|
return "[语音]"
|
||||||
|
|
||||||
|
asr_provider = global_config.voice.asr_provider
|
||||||
|
|
||||||
|
# 如果选择本地识别
|
||||||
|
if asr_provider == "local":
|
||||||
|
from src.plugin_system.apis import tool_api
|
||||||
|
import tempfile
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
|
||||||
|
local_asr_tool = tool_api.get_tool_instance("local_asr")
|
||||||
|
if not local_asr_tool:
|
||||||
|
logger.error("ASR provider 设置为 'local' 但未找到 'local_asr' 工具,请检查 stt_whisper_plugin 是否已加载。")
|
||||||
|
return "[语音(本地识别工具未找到)]"
|
||||||
|
|
||||||
|
audio_path = None
|
||||||
|
try:
|
||||||
|
audio_data = base64.b64decode(voice_base64)
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".amr") as tmp_audio_file:
|
||||||
|
tmp_audio_file.write(audio_data)
|
||||||
|
audio_path = tmp_audio_file.name
|
||||||
|
|
||||||
|
text = await local_asr_tool.execute(function_args={"audio_path": audio_path})
|
||||||
|
if "失败" in text or "出错" in text or "错误" in text:
|
||||||
|
logger.warning(f"本地语音识别失败: {text}")
|
||||||
|
return f"[语音(本地识别失败)]"
|
||||||
|
|
||||||
|
logger.info(f"本地语音识别成功: {text}")
|
||||||
|
return f"[语音] {text}"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"本地语音转文字失败: {e!s}")
|
||||||
|
return "[语音(本地识别出错)]"
|
||||||
|
finally:
|
||||||
|
if audio_path and os.path.exists(audio_path):
|
||||||
|
try:
|
||||||
|
os.remove(audio_path)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"清理临时音频文件失败: {e}")
|
||||||
|
|
||||||
|
# 默认使用 API 识别
|
||||||
try:
|
try:
|
||||||
_llm = LLMRequest(model_set=model_config.model_task_config.voice, request_type="audio")
|
_llm = LLMRequest(model_set=model_config.model_task_config.voice, request_type="audio")
|
||||||
text = await _llm.generate_response_for_voice(voice_base64)
|
text = await _llm.generate_response_for_voice(voice_base64)
|
||||||
|
|||||||
@@ -257,6 +257,7 @@ class VoiceConfig(ValidatedConfigBase):
|
|||||||
"""语音识别配置类"""
|
"""语音识别配置类"""
|
||||||
|
|
||||||
enable_asr: bool = Field(default=False, description="启用语音识别")
|
enable_asr: bool = Field(default=False, description="启用语音识别")
|
||||||
|
asr_provider: str = Field(default="api", description="语音识别提供商")
|
||||||
|
|
||||||
|
|
||||||
class EmojiConfig(ValidatedConfigBase):
|
class EmojiConfig(ValidatedConfigBase):
|
||||||
|
|||||||
9
src/plugins/built_in/stt_whisper_plugin/__init__.py
Normal file
9
src/plugins/built_in/stt_whisper_plugin/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
from src.plugin_system.base.plugin_metadata import PluginMetadata
|
||||||
|
|
||||||
|
# Plugin manifest for the Whisper-based local speech-to-text plugin.
__plugin_meta__ = PluginMetadata(
    name="Whisper本地语音识别",
    description="通过OpenAI Whisper模型提供本地语音转文字的功能",
    usage="在 bot_config.toml 中将 asr_provider 设置为 'local' 即可启用",
    version="0.1.0",
    author="Elysia",
)
|
||||||
115
src/plugins/built_in/stt_whisper_plugin/plugin.py
Normal file
115
src/plugins/built_in/stt_whisper_plugin/plugin.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from typing import Any
|
||||||
|
from pathlib import Path
|
||||||
|
import toml
|
||||||
|
|
||||||
|
import whisper
|
||||||
|
|
||||||
|
from src.common.logger import get_logger
|
||||||
|
from src.plugin_system import BasePlugin, ComponentInfo, register_plugin
|
||||||
|
from src.plugin_system.base.base_tool import BaseTool
|
||||||
|
from src.plugin_system.base.component_types import ComponentType, ToolInfo
|
||||||
|
|
||||||
|
logger = get_logger("stt_whisper_plugin")

# Module-level cache for the loaded Whisper model, so it is not reloaded on
# every transcription request. Stays None until a load succeeds.
_whisper_model: Any = None
# True while a model load is in flight.
_is_loading: bool = False
|
||||||
|
|
||||||
|
class LocalASRTool(BaseTool):
    """Tool that transcribes an audio file on disk with a local Whisper model."""

    tool_name = "local_asr"
    tool_description = "将本地音频文件路径转换为文字。"
    tool_parameters = [
        {
            "name": "audio_path",
            "type": "string",
            "description": "需要识别的音频文件路径",
            "required": True,
        }
    ]

    @classmethod
    async def load_model_once(cls, plugin_config: dict):
        """Load the Whisper model into the module-level cache if needed.

        Does nothing when a model is already cached or another load is in
        progress. ``plugin_config`` is read for ``whisper.model_size``
        (default ``"tiny"``) and ``whisper.device`` (default ``"cpu"``).
        Load errors are logged and leave the cache empty; they are not raised.
        """
        global _whisper_model, _is_loading

        if _whisper_model is not None or _is_loading:
            return

        _is_loading = True
        try:
            whisper_section = plugin_config.get("whisper", {})
            model_size = whisper_section.get("model_size", "tiny")
            device = whisper_section.get("device", "cpu")
            logger.info(f"正在预加载 Whisper ASR 模型: {model_size} ({device})")

            # whisper.load_model blocks, so run it in the default executor
            # instead of on the event loop thread.
            _whisper_model = await asyncio.get_running_loop().run_in_executor(
                None, whisper.load_model, model_size, device
            )
            logger.info(f"Whisper ASR 模型 '{model_size}' 预加载成功!")
        except Exception as e:
            logger.error(f"预加载 Whisper ASR 模型失败: {e}")
            _whisper_model = None
        finally:
            _is_loading = False

    async def execute(self, function_args: dict) -> str:
        """Transcribe the file named by ``function_args["audio_path"]``.

        Returns the stripped transcript on success. A missing argument, an
        unavailable model, or a transcription error is reported as a
        human-readable message in the returned string rather than raised.
        """
        audio_path = function_args.get("audio_path")
        if not audio_path:
            return "错误:缺少 audio_path 参数。"

        # Wait for an in-flight background load to finish before deciding
        # whether a model is available.
        while _is_loading:
            await asyncio.sleep(0.2)

        model = _whisper_model
        if model is None:
            return "Whisper 模型加载失败,无法识别语音。"

        try:
            logger.info(f"开始使用 Whisper 识别音频: {audio_path}")
            # transcribe blocks, so it is dispatched to the default executor.
            result = await asyncio.get_running_loop().run_in_executor(
                None, model.transcribe, audio_path
            )
            text = str(result.get("text", "")).strip()
            logger.info(f"音频识别成功: {text}")
            return text
        except Exception as e:
            logger.error(f"使用 Whisper 识别音频失败: {e}")
            return f"语音识别出错: {e}"
|
||||||
|
|
||||||
|
@register_plugin
class STTWhisperPlugin(BasePlugin):
    """Plugin that provides the ``local_asr`` tool when ``voice.asr_provider`` is ``"local"``."""

    plugin_name = "stt_whisper_plugin"
    config_file_name = "config.toml"
    python_dependencies = ["openai-whisper"]

    # Strong references to fire-and-forget tasks. The event loop keeps only
    # weak references to tasks, so a task nobody else references may be
    # garbage-collected before it finishes.
    _background_tasks = set()

    async def on_plugin_loaded(self):
        """Start preloading the Whisper model in the background.

        The preload is only scheduled when the main config selects the
        ``"local"`` ASR provider. Any error while scheduling it is logged and
        swallowed so plugin loading is not interrupted.
        """
        try:
            from src.config.config import global_config

            if global_config.voice.asr_provider == "local":
                # Schedule the load as a task so the caller is not blocked,
                # and hold a reference until the task completes.
                preload_task = asyncio.create_task(LocalASRTool.load_model_once(self.config or {}))
                self._background_tasks.add(preload_task)
                preload_task.add_done_callback(self._background_tasks.discard)
        except Exception as e:
            logger.error(f"触发 Whisper 模型预加载时出错: {e}")

    def get_plugin_components(self) -> list[tuple[ComponentInfo, type]]:
        """Return the components to register, based on the main config.

        Returns a single ``(ToolInfo, LocalASRTool)`` entry when
        ``voice.asr_provider`` is ``"local"``; otherwise an empty list. If the
        config cannot be read or the tool info cannot be built, the error is
        logged and an empty list is returned.
        """
        try:
            from src.config.config import global_config

            if global_config.voice.asr_provider == "local":
                logger.info("ASR provider is 'local', enabling local_asr tool.")
                tool_info = ToolInfo(
                    name=LocalASRTool.tool_name,
                    description=LocalASRTool.tool_description,
                    component_type=ComponentType.TOOL,
                )
                return [(tool_info, LocalASRTool)]
        except Exception as e:
            logger.error(f"检查 ASR provider 配置时出错: {e}")
            # The failure is already reported; do not additionally claim that
            # the provider is not 'local'.
            return []

        logger.debug("ASR provider is not 'local', whisper plugin's tool is disabled.")
        return []
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
[inner]
|
[inner]
|
||||||
version = "7.4.8"
|
version = "7.4.9"
|
||||||
|
|
||||||
#----以下是给开发人员阅读的,如果你只是部署了MoFox-Bot,不需要阅读----
|
#----以下是给开发人员阅读的,如果你只是部署了MoFox-Bot,不需要阅读----
|
||||||
#如果你想要修改配置文件,请递增version的值
|
#如果你想要修改配置文件,请递增version的值
|
||||||
@@ -317,6 +317,9 @@ allow_reset = true # 允许重置
|
|||||||
|
|
||||||
[voice]
|
[voice]
|
||||||
enable_asr = true # 是否启用语音识别,启用后MoFox-Bot可以识别语音消息,启用该功能需要配置语音识别模型[model.voice]
|
enable_asr = true # 是否启用语音识别,启用后MoFox-Bot可以识别语音消息,启用该功能需要配置语音识别模型[model.voice]
|
||||||
|
# [语音识别提供商] 可选值: "api", "local". 默认使用 "api".
|
||||||
|
# 注意: "local" 会消耗大量CPU资源, 可能导致低配服务器卡顿. 详情请见 stt_whisper_plugin 插件配置.
|
||||||
|
asr_provider = "api"
|
||||||
|
|
||||||
[lpmm_knowledge] # lpmm知识库配置
|
[lpmm_knowledge] # lpmm知识库配置
|
||||||
enable = false # 是否启用lpmm知识库
|
enable = false # 是否启用lpmm知识库
|
||||||
|
|||||||
Reference in New Issue
Block a user