feat:为tts增加Qwen-Omni接口
This commit is contained in:
@@ -2,17 +2,17 @@ from src.plugin_system.base.plugin_metadata import PluginMetadata
|
||||
|
||||
# Static metadata describing the TTS voice plugin to the plugin system.
# Each keyword argument appears exactly once: the pre-change values
# (old description / version / author / keywords / dependency list) are
# dropped so the call no longer repeats keyword arguments.
__plugin_meta__ = PluginMetadata(
    name="GPT-SoVITS 语音合成插件",
    description="基于 GPT-SoVITS 和 Qwen Omni 的文本转语音插件,支持多种语言和多风格语音合成。",
    usage=" ",
    version="3.2.0",
    author="靓仔 & AI助手",
    license="AGPL-v3.0",
    repository_url="https://github.com/xuqian13/tts_voice_plugin",
    keywords=["tts", "语音合成", "文本转语音", "gpt-sovits", "qwen-omni", "语音", "朗读", "多风格", "语音播报"],
    categories=["Utility", "Communication", "Accessibility"],
    extra={
        "is_built_in": False,
        "plugin_type": "tools",
    },
    # openai / toml / numpy are needed by the Qwen Omni backend and config handling.
    python_dependencies=["aiohttp", "soundfile", "pedalboard", "openai", "toml", "numpy"],
)
|
||||
@@ -4,6 +4,7 @@ TTS 语音合成 Action
|
||||
|
||||
from pathlib import Path
|
||||
from typing import ClassVar
|
||||
import traceback
|
||||
|
||||
import toml
|
||||
|
||||
@@ -16,38 +17,134 @@ from ..services.manager import get_service
|
||||
logger = get_logger("tts_voice_plugin.action")
|
||||
|
||||
|
||||
def _create_default_config(config_file: Path) -> bool:
    """Write a starter configuration file for the TTS plugin.

    The parent directory is created when missing and a built-in default
    configuration is serialised as TOML to ``config_file``. The file is
    opened in write mode, so existing content at that path is replaced.

    Args:
        config_file: Destination path of the configuration file.

    Returns:
        True when the file was written; False when any step raised (the
        failure is logged together with its traceback).
    """
    # Sections are built one by one and assembled in serialisation order.
    plugin_section = {
        "enabled": True,
        "debug": False
    }
    components_section = {
        "action_enabled": True,
        "command_enabled": True
    }
    tts_section = {
        "engine": "qwen-omni",
        "server": "http://127.0.0.1:9880",
        "timeout": 60,
        "max_text_length": 500
    }
    qwen_omni_section = {
        "api_key": "your-api-key-here",
        "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
        "model_name": "qwen-omni-turbo",
        "voice_character": "Chelsie",
        "media_format": "wav"
    }
    advanced_section = {
        "top_k": 5,
        "top_p": 1.0,
        "temperature": 1.0,
        "batch_size": 1,
        "split_bucket": True
    }
    spatial_section = {
        "enabled": False,
        "reverb_enabled": True,
        "room_size": 0.15,
        "damping": 0.5,
        "wet_level": 0.33,
        "dry_level": 0.4,
        "width": 1.0,
        "convolution_enabled": False,
        "convolution_mix": 0.5
    }
    # Placeholder GPT-SoVITS style; the paths must be replaced by the user.
    placeholder_style = {
        "style_name": "default",
        "name": "默认风格",
        "refer_wav_path": "/path/to/your/reference.wav",
        "prompt_text": "这是一个示例文本,请替换为您自己的参考音频文本。",
        "prompt_language": "zh",
        "gpt_weights": "/path/to/your/gpt_weights.pth",
        "sovits_weights": "/path/to/your/sovits_weights.pth",
        "speed_factor": 1.0,
        "text_language": "auto"
    }
    default_config = {
        "plugin": plugin_section,
        "components": components_section,
        "tts": tts_section,
        "qwen_omni": qwen_omni_section,
        "tts_advanced": advanced_section,
        "spatial_effects": spatial_section,
        "tts_styles": [placeholder_style]
    }

    try:
        # Make sure the configuration directory exists before writing.
        config_file.parent.mkdir(parents=True, exist_ok=True)

        with open(config_file, 'w', encoding='utf-8') as f:
            toml.dump(default_config, f)

        logger.info(f"已创建默认配置文件: {config_file}")
        return True
    except Exception as e:
        logger.error(f"创建默认配置文件失败: {e}", exc_info=True)
        return False
|
||||
|
||||
|
||||
def _get_available_styles() -> list[str]:
    """Read the plugin configuration and list the usable TTS style names.

    The configuration file is looked up in several candidate locations
    relative to this module. When the configured engine is ``qwen-omni``
    only the ``default`` style is offered; otherwise the ``style_name`` of
    every entry in the ``tts_styles`` list is collected.

    Returns:
        The non-empty style names found, or ``["default"]`` when no
        configuration file exists, the configuration is malformed, no
        style is defined, or any error occurs (errors are logged).
    """
    try:
        plugin_file = Path(__file__).resolve()
        # <plugin dir>/actions/<this file> -> <plugin dir>
        plugin_root = plugin_file.parent.parent

        # Candidate locations, tried in order; the first existing file wins.
        possible_paths = [
            # Standard location: four levels above the plugin directory.
            plugin_root.parent.parent.parent.parent / "config" / "plugins" / "tts_voice_plugin" / "config.toml",
            # Alternative location: three levels above the plugin directory.
            plugin_root.parent.parent.parent / "config" / "plugins" / "tts_voice_plugin" / "config.toml",
            # Development location: config.toml directly inside the plugin directory.
            plugin_root / "config.toml"
        ]
        config_file = next((path for path in possible_paths if path.is_file()), None)

        if config_file is None:
            logger.warning("配置文件不存在,使用默认风格列表")
            return ["default"]

        config = toml.loads(config_file.read_text(encoding="utf-8"))

        # Determine the active engine; tolerate a `tts` key that is not a table.
        tts_section = config.get("tts", {})
        engine = tts_section.get("engine", "gpt-sovits") if isinstance(tts_section, dict) else "gpt-sovits"

        if engine == "qwen-omni":
            # Qwen Omni exposes only the default style.
            return ["default"]

        # GPT-SoVITS: styles come from the `tts_styles` array of tables.
        styles_config = config.get("tts_styles", [])
        if not isinstance(styles_config, list):
            logger.warning(f"tts_styles 配置不是列表类型: {type(styles_config)}")
            return ["default"]

        # Explicit loop with type checks so only non-empty string names are kept.
        style_names: list[str] = []
        for style in styles_config:
            if isinstance(style, dict):
                name = style.get("style_name")
                if isinstance(name, str) and name:
                    style_names.append(name)

        return style_names if style_names else ["default"]

    except Exception as e:
        logger.error(f"动态加载TTS风格列表时出错: {e}", exc_info=True)
        return ["default"]  # fall back on any error
|
||||
|
||||
|
||||
@@ -68,7 +165,7 @@ class TTSVoiceAction(BaseAction):
|
||||
parallel_action = False
|
||||
|
||||
action_parameters: ClassVar[dict] = {
|
||||
"tts_voice_text": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "需要转换为语音并发送的完整、自然、适合口语的文本内容。",
|
||||
"required": True
|
||||
@@ -101,7 +198,7 @@ class TTSVoiceAction(BaseAction):
|
||||
|
||||
action_require: ClassVar[list] = [
|
||||
"在调用此动作时,你必须在 'text' 参数中提供要合成语音的完整回复内容。这是强制性的。",
|
||||
"当用户明确请求使用语音进行回复时,例如‘发个语音听听’、‘用语音说’等。",
|
||||
"当用户明确请求使用语音进行回复时,例如'发个语音听听'、'用语音说'等。",
|
||||
"当对话内容适合用语音表达,例如讲故事、念诗、撒嬌或进行角色扮演时。",
|
||||
"在表达特殊情感(如安慰、鼓励、庆祝)的场景下,可以主动使用语音来增强感染力。",
|
||||
"不要在日常的、简短的问答或闲聊中频繁使用语音,避免打扰用户。",
|
||||
@@ -119,34 +216,35 @@ class TTSVoiceAction(BaseAction):
|
||||
"""
|
||||
判断此 Action 是否应该被激活。
|
||||
满足以下任一条件即可激活:
|
||||
1. 55% 的随机概率
|
||||
1. 25% 的随机概率
|
||||
2. 匹配到预设的关键词
|
||||
3. LLM 判断当前场景适合发送语音
|
||||
"""
|
||||
# 条件1: 随机激活
|
||||
if await self._random_activation(0.25):
|
||||
logger.info(f"{self.log_prefix} 随机激活成功 (25%)")
|
||||
return True
|
||||
try:
|
||||
# 条件1: 随机激活
|
||||
if await self._random_activation(0.25):
|
||||
logger.info(f"{self.log_prefix} 随机激活成功 (25%)")
|
||||
return True
|
||||
|
||||
# 条件2: 关键词激活
|
||||
keywords = [
|
||||
"发语音", "语音", "说句话", "用语音说", "听你", "听声音", "想你", "想听声音",
|
||||
"讲个话", "说段话", "念一下", "读一下", "用嘴说", "说", "能发语音吗", "亲口"
|
||||
]
|
||||
if await self._keyword_match(keywords):
|
||||
logger.info(f"{self.log_prefix} 关键词激活成功")
|
||||
return True
|
||||
# 条件2: 关键词激活
|
||||
keywords = [
|
||||
"发语音", "语音", "说句话", "用语音说", "听你", "听声音", "想你", "想听声音",
|
||||
"讲个话", "说段话", "念一下", "读一下", "用嘴说", "说", "能发语音吗", "亲口"
|
||||
]
|
||||
if await self._keyword_match(keywords):
|
||||
logger.info(f"{self.log_prefix} 关键词激活成功")
|
||||
return True
|
||||
|
||||
# 条件3: LLM 判断激活
|
||||
# 注意:这里我们复用 action_require 里的描述,让 LLM 的判断更精准
|
||||
if await self._llm_judge_activation(
|
||||
llm_judge_model=llm_judge_model
|
||||
):
|
||||
logger.info(f"{self.log_prefix} LLM 判断激活成功")
|
||||
return True
|
||||
# 条件3: LLM 判断激活
|
||||
if await self._llm_judge_activation(llm_judge_model=llm_judge_model):
|
||||
logger.info(f"{self.log_prefix} LLM 判断激活成功")
|
||||
return True
|
||||
|
||||
logger.debug(f"{self.log_prefix} 所有激活条件均未满足,不激活")
|
||||
return False
|
||||
logger.debug(f"{self.log_prefix} 所有激活条件均未满足,不激活")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"{self.log_prefix} 激活判断失败: {e}")
|
||||
return False
|
||||
|
||||
async def execute(self) -> tuple[bool, str]:
|
||||
"""
|
||||
@@ -157,36 +255,38 @@ class TTSVoiceAction(BaseAction):
|
||||
logger.error(f"{self.log_prefix} TTSService 未注册或初始化失败,静默处理。")
|
||||
return False, "TTSService 未注册或初始化失败"
|
||||
|
||||
initial_text = self.action_data.get("tts_voice_text", "").strip()
|
||||
# 尝试多种可能的参数名
|
||||
initial_text = self.action_data.get("text", "").strip()
|
||||
if not initial_text:
|
||||
initial_text = self.action_data.get("tts_voice_text", "").strip()
|
||||
|
||||
voice_style = self.action_data.get("voice_style", "default")
|
||||
# 新增:从决策模型获取指定的语言模式
|
||||
text_language = self.action_data.get("text_language") # 如果模型没给,就是 None
|
||||
logger.info(f"{self.log_prefix} 接收到规划器初步文本: '{initial_text[:70]}...', 指定风格: {voice_style}, 指定语言: {text_language}")
|
||||
text_language = self.action_data.get("text_language")
|
||||
|
||||
logger.info(f"{self.log_prefix} 接收到规划器文本: '{initial_text[:70]}...', 风格: {voice_style}, 语言: {text_language}")
|
||||
|
||||
# 1. 使用规划器提供的文本
|
||||
text = initial_text
|
||||
if not text:
|
||||
if not initial_text:
|
||||
logger.warning(f"{self.log_prefix} 规划器提供的文本为空,静默处理。")
|
||||
return False, "规划器提供的文本为空"
|
||||
|
||||
# 2. 调用 TTSService 生成语音
|
||||
logger.info(f"{self.log_prefix} 使用最终文本进行语音合成: '{text[:70]}...'")
|
||||
# 调用 TTSService 生成语音
|
||||
logger.info(f"{self.log_prefix} 使用最终文本进行语音合成: '{initial_text[:70]}...'")
|
||||
audio_b64 = await self.tts_service.generate_voice(
|
||||
text=text,
|
||||
text=initial_text,
|
||||
style_hint=voice_style,
|
||||
language_hint=text_language # 新增:将决策模型指定的语言传递给服务
|
||||
language_hint=text_language
|
||||
)
|
||||
|
||||
if audio_b64:
|
||||
# 在发送语音前,将文本注册到缓存中
|
||||
register_self_voice(audio_b64, text)
|
||||
register_self_voice(audio_b64, initial_text)
|
||||
await self.send_custom(message_type="voice", content=audio_b64)
|
||||
logger.info(f"{self.log_prefix} GPT-SoVITS语音发送成功")
|
||||
logger.info(f"{self.log_prefix} 语音发送成功")
|
||||
await self.store_action_info(
|
||||
action_prompt_display=f"将文本转换为语音并发送 (风格:{voice_style})",
|
||||
action_done=True
|
||||
)
|
||||
return True, f"成功生成并发送语音,文本长度: {len(text)}字符"
|
||||
return True, f"成功生成并发送语音,文本长度: {len(initial_text)}字符"
|
||||
else:
|
||||
logger.error(f"{self.log_prefix} TTS服务未能返回音频数据,静默处理。")
|
||||
await self.store_action_info(
|
||||
@@ -197,9 +297,9 @@ class TTSVoiceAction(BaseAction):
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"{self.log_prefix} 语音合成过程中发生未知错误: {e!s}")
|
||||
logger.error(traceback.format_exc())
|
||||
await self.store_action_info(
|
||||
action_prompt_display=f"语音合成失败: {e!s}",
|
||||
action_done=False
|
||||
)
|
||||
return False, f"语音合成出错: {e!s}"
|
||||
|
||||
return False, f"语音合成出错: {e!s}"
|
||||
@@ -19,7 +19,7 @@ class TTSVoiceCommand(PlusCommand):
|
||||
"""
|
||||
|
||||
command_name: str = "tts"
|
||||
command_description: str = "使用GPT-SoVITS将文本转换为语音并发送"
|
||||
command_description: str = "使用GPT-SoVITS或Qwen Omni将文本转换为语音并发送"
|
||||
command_aliases: ClassVar[list[str]] = ["语音合成", "说"]
|
||||
command_usage = "/tts <要说的文本> [风格]"
|
||||
|
||||
@@ -41,8 +41,14 @@ class TTSVoiceCommand(PlusCommand):
|
||||
if not tts_service:
|
||||
raise RuntimeError("TTSService 未注册或初始化失败")
|
||||
|
||||
# 获取可用风格列表
|
||||
available_styles = tts_service.tts_styles.keys()
|
||||
# 获取可用风格列表 - 兼容不同的 TTS 服务类型
|
||||
available_styles = []
|
||||
if hasattr(tts_service, 'tts_styles'):
|
||||
# GPT-SoVITS 服务
|
||||
available_styles = list(tts_service.tts_styles.keys())
|
||||
else:
|
||||
# Qwen Omni 服务 - 使用默认风格
|
||||
available_styles = ["default"]
|
||||
|
||||
text_to_speak = ""
|
||||
style_hint = "default"
|
||||
@@ -61,7 +67,11 @@ class TTSVoiceCommand(PlusCommand):
|
||||
await self.send_text("请提供要转换为语音的文本内容哦!")
|
||||
return False, "文本内容为空", True
|
||||
|
||||
audio_b64 = await tts_service.generate_voice(text_to_speak, style_hint)
|
||||
# 调用 TTS 服务生成语音
|
||||
audio_b64 = await tts_service.generate_voice(
|
||||
text=text_to_speak,
|
||||
style_hint=style_hint
|
||||
)
|
||||
|
||||
if audio_b64:
|
||||
await self.send_type(message_type="voice", content=audio_b64)
|
||||
@@ -73,4 +83,4 @@ class TTSVoiceCommand(PlusCommand):
|
||||
except Exception as e:
|
||||
logger.error(f"执行 /tts 命令时出错: {e}")
|
||||
await self.send_text("❌ 语音合成时发生了意想不到的错误,请查看日志。")
|
||||
return False, "命令执行异常", True
|
||||
return False, "命令执行异常", True
|
||||
@@ -1,14 +1,22 @@
|
||||
"""
|
||||
TTS Voice 插件 - 重构版
|
||||
"""
|
||||
import base64
|
||||
import io
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, ClassVar
|
||||
from typing import Any, ClassVar, Dict, AsyncIterator
|
||||
|
||||
import toml
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from openai import OpenAI
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.plugin_system import BasePlugin, ComponentInfo, register_plugin
|
||||
from src.plugin_system.base.component_types import PermissionNodeField
|
||||
from src.plugin_system.base.config_types import ConfigField
|
||||
|
||||
from .actions.tts_action import TTSVoiceAction
|
||||
from .commands.tts_command import TTSVoiceCommand
|
||||
@@ -17,18 +25,191 @@ from .services.tts_service import TTSService
|
||||
|
||||
logger = get_logger("tts_voice_plugin")
|
||||
|
||||
@dataclass
class QwenOmniConfig:
    """Settings for the Qwen Omni text-to-speech backend.

    The field defaults below are the single source of truth; ``from_dict``
    reuses them instead of repeating the literals.
    """

    # API key for the OpenAI-compatible endpoint; masked in ``repr``.
    api_key: str
    # Model identifier sent with each request.
    model_name: str = "qwen-omni-turbo"
    # Voice requested for the audio output.
    voice_character: str = "Chelsie"
    # Audio format requested from the API.
    media_format: str = "wav"
    # Base URL of the OpenAI-compatible endpoint.
    base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"

    def __repr__(self) -> str:
        """Return a debug representation that never contains the API key."""
        masked_key = "***" if self.api_key else ""
        return (
            f"{type(self).__name__}(api_key={masked_key!r}, "
            f"model_name={self.model_name!r}, "
            f"voice_character={self.voice_character!r}, "
            f"media_format={self.media_format!r}, "
            f"base_url={self.base_url!r})"
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "QwenOmniConfig":
        """Build a config from a plain mapping.

        Args:
            data: Mapping that may contain any of the field names. Unknown
                keys are ignored; missing keys and ``None`` values fall
                back to the field default (``""`` for ``api_key``).

        Returns:
            A populated ``QwenOmniConfig``.
        """
        source = data or {}
        fallback = cls(api_key="")
        values = {}
        for field_name in cls.__dataclass_fields__:
            candidate = source.get(field_name)
            values[field_name] = getattr(fallback, field_name) if candidate is None else candidate
        return cls(**values)
|
||||
|
||||
|
||||
class QwenOmniTTSModel:
    """Text-to-speech backend that has a Qwen Omni chat model read text aloud.

    The model is called through an OpenAI-compatible endpoint with audio
    output enabled. The streamed base64 audio is joined, decoded and wrapped
    in a WAV container. ``generate_voice`` offers the same call signature
    the plugin uses for its other TTS service.
    """

    def __init__(self, get_config_func):
        """Store the config getter and load the Qwen Omni settings.

        Args:
            get_config_func: Callable ``(key, default)`` returning plugin
                configuration values for dotted keys.
        """
        self.get_config = get_config_func
        self.config = self._load_config()

    def _load_config(self) -> QwenOmniConfig:
        """Read the ``qwen_omni.*`` settings through the config getter.

        Returns:
            The loaded configuration, or one with an empty API key when
            reading the configuration raised (the error is logged).
        """
        try:
            config_data = {
                "api_key": self.get_config("qwen_omni.api_key", ""),
                "model_name": self.get_config("qwen_omni.model_name", "qwen-omni-turbo"),
                "voice_character": self.get_config("qwen_omni.voice_character", "Chelsie"),
                "media_format": self.get_config("qwen_omni.media_format", "wav"),
                "base_url": self.get_config("qwen_omni.base_url", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
            }
            return QwenOmniConfig.from_dict(config_data)
        except Exception as e:
            logger.error(f"加载 Qwen Omni 配置失败: {e}")
            return QwenOmniConfig(api_key="")

    async def tts(self, text: str, **kwargs) -> bytes | None:
        """Synthesise ``text`` and return the audio as WAV bytes.

        Args:
            text: Text to be spoken.
            **kwargs: Forwarded unchanged to ``_tts_stream``.

        Returns:
            WAV file bytes, or None when no audio was received or any
            step failed (failures are logged with a traceback).
        """
        try:
            # Collect all streamed base64 fragments, then join once.
            fragments = [chunk async for chunk in self._tts_stream(text, **kwargs)]
            audio_base64_string = "".join(fragments)

            if not audio_base64_string:
                logger.error("没有收到任何音频数据")
                return None

            # The joined base64 string decodes to raw PCM samples.
            pcm_data = base64.b64decode(audio_base64_string)

            # Wrap the PCM samples in a WAV container.
            return self._pcm_to_wav_soundfile(pcm_data)
        except Exception as e:
            logger.error(f"Qwen Omni TTS 失败: {e}")
            logger.error(traceback.format_exc())
            return None

    def _pcm_to_wav_soundfile(self, pcm_data: bytes, sample_rate: int = 24000, channels: int = 1) -> bytes:
        """Wrap raw 16-bit signed PCM samples in a WAV container.

        ``soundfile`` is used first. If it fails, the standard-library
        ``wave`` module writes the container instead, so the caller never
        receives header-less PCM labelled as WAV.

        Args:
            pcm_data: Raw PCM bytes, assumed to be 16-bit signed integers.
            sample_rate: Sample rate written to the WAV header.
            channels: Number of interleaved channels in ``pcm_data``.

        Returns:
            The bytes of a complete WAV file.
        """
        channels = max(int(channels), 1)
        # Drop trailing bytes that cannot form a whole 16-bit frame;
        # np.frombuffer rejects buffers whose size is not a multiple of 2.
        frame_size = 2 * channels
        pcm_data = pcm_data[: len(pcm_data) - (len(pcm_data) % frame_size)]

        try:
            audio_array = np.frombuffer(pcm_data, dtype=np.int16)
            if channels > 1:
                # soundfile expects (frames, channels) for multi-channel data.
                audio_array = audio_array.reshape(-1, channels)

            with io.BytesIO() as wav_io:
                sf.write(wav_io, audio_array, sample_rate, format='WAV')
                wav_bytes = wav_io.getvalue()

            logger.info(f"使用soundfile转换PCM到WAV: {len(pcm_data)}字节PCM -> {len(wav_bytes)}字节WAV")
            return wav_bytes

        except Exception as e:
            logger.error(f"使用soundfile转换PCM到WAV失败: {e}")
            logger.error(traceback.format_exc())

            # Fallback: write the header with the standard library.
            import wave

            fallback_io = io.BytesIO()
            with wave.open(fallback_io, "wb") as writer:
                writer.setnchannels(channels)
                writer.setsampwidth(2)  # 16-bit samples
                writer.setframerate(sample_rate)
                writer.writeframes(pcm_data)
            return fallback_io.getvalue()

    async def _tts_stream(self, text: str, **kwargs) -> AsyncIterator[str]:
        """Stream base64-encoded audio fragments for ``text`` from the API.

        The asynchronous OpenAI client is used so that waiting for streamed
        chunks does not block the event loop.

        Args:
            text: Text the model is asked to repeat verbatim.
            **kwargs: Accepted for interface compatibility; currently unused.

        Yields:
            Base64 string fragments of the audio, in arrival order.

        Raises:
            Exception: Any API or network error is logged and re-raised.
        """
        try:
            # Imported here because the module only imports the synchronous client.
            from openai import AsyncOpenAI

            logger.info(f"开始调用Qwen Omni API生成音频,文本: {text[:30]}{'...' if len(text) > 30 else ''}")

            prompt = f"复述这句话,不要输出其他内容,只输出'{text}'就好,不要输出其他内容,不要输出前后缀,不要输出'{text}'以外的内容,不要说:如果还有类似的需求或者想聊聊别的"
            logger.info(f"使用prompt: {prompt}")

            # The context manager closes the underlying HTTP client afterwards.
            async with AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url) as client:
                completion = await client.chat.completions.create(
                    model=self.config.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    modalities=["text", "audio"],
                    audio={
                        "voice": self.config.voice_character,
                        "format": self.config.media_format,
                    },
                    stream=True,
                    stream_options={"include_usage": True},
                )

                audio_data_received = False
                total_audio_length = 0

                async for chunk in completion:
                    choices = getattr(chunk, "choices", None)
                    if choices:
                        delta = choices[0].delta

                        # Audio may arrive as a plain dict or as an object with `.data`.
                        audio_payload = getattr(delta, "audio", None)
                        if audio_payload:
                            if isinstance(audio_payload, dict):
                                audio_data = audio_payload.get("data")
                            else:
                                audio_data = getattr(audio_payload, "data", None)

                            if audio_data:
                                total_audio_length += len(audio_data)
                                audio_data_received = True
                                yield audio_data
                            else:
                                logger.debug(f"音频字典内容: {audio_payload}")

                        # Text content is only logged, for debugging.
                        text_piece = getattr(delta, "content", None)
                        if text_piece:
                            logger.debug(f"收到文本内容: {text_piece}")

                    usage = getattr(chunk, "usage", None)
                    if usage:
                        logger.info(f"本次使用量: {usage}")

                logger.info(f"音频数据接收完成,总base64长度: {total_audio_length}")
                if not audio_data_received:
                    logger.warning("API调用成功但没有收到音频数据")

        except Exception as e:
            logger.error(f"Qwen Omni API调用失败: {e}")
            logger.error(traceback.format_exc())
            raise

    async def generate_voice(self, text: str, style_hint: str = "default", language_hint: str | None = None) -> str | None:
        """Generate speech for ``text`` and return it base64-encoded.

        Args:
            text: Text to be spoken.
            style_hint: Accepted for interface compatibility; not used here.
            language_hint: Accepted for interface compatibility; not used here.

        Returns:
            Base64 string of the WAV audio, or None when generation failed.
        """
        try:
            logger.info(f"开始生成语音,文本: {text}")
            audio_data = await self.tts(text)
            if not audio_data:
                logger.error("语音生成失败,audio_data 为 None")
                return None

            logger.info(f"语音生成成功,数据长度: {len(audio_data)} 字节")
            # Return the WAV bytes as a base64 string.
            return base64.b64encode(audio_data).decode("utf-8")
        except Exception as e:
            logger.error(f"Qwen Omni 语音生成失败: {e}")
            logger.error(traceback.format_exc())
            return None
|
||||
|
||||
|
||||
@register_plugin
|
||||
class TTSVoicePlugin(BasePlugin):
|
||||
"""
|
||||
GPT-SoVITS 语音合成插件 - 重构版
|
||||
GPT-SoVITS 和 Qwen Omni 语音合成插件
|
||||
"""
|
||||
|
||||
plugin_name = "tts_voice_plugin"
|
||||
plugin_description = "基于GPT-SoVITS的文本转语音插件(重构版)"
|
||||
plugin_version = "3.1.2"
|
||||
plugin_author = "Kilo Code & 靚仔"
|
||||
enable_plugin = True
|
||||
plugin_description = "基于GPT-SoVITS和Qwen Omni的文本转语音插件"
|
||||
plugin_version = "3.2.0"
|
||||
plugin_author = "Kilo Code & 靓仔 & AI助手"
|
||||
config_file_name = "config.toml"
|
||||
dependencies: ClassVar[list[str]] = []
|
||||
|
||||
@@ -36,182 +217,133 @@ class TTSVoicePlugin(BasePlugin):
|
||||
PermissionNodeField(node_name="command.use", description="是否可以使用 /tts 命令"),
|
||||
]
|
||||
|
||||
config_schema: ClassVar[dict] = {}
|
||||
# 使用 ConfigField 的配置架构
|
||||
config_schema: ClassVar[dict] = {
|
||||
"plugin": {
|
||||
"enabled": ConfigField(type=bool, default=True, description="是否启用插件"),
|
||||
"debug": ConfigField(type=bool, default=False, description="是否开启调试模式")
|
||||
},
|
||||
"components": {
|
||||
"action_enabled": ConfigField(type=bool, default=True, description="是否启用TTS Action"),
|
||||
"command_enabled": ConfigField(type=bool, default=True, description="是否启用TTS命令")
|
||||
},
|
||||
"tts": {
|
||||
"server": ConfigField(type=str, default="http://127.0.0.1:9880", description="GPT-SoVITS服务器地址"),
|
||||
"timeout": ConfigField(type=int, default=60, description="TTS请求超时时间(秒)"),
|
||||
"max_text_length": ConfigField(type=int, default=500, description="最大文本长度"),
|
||||
"engine": ConfigField(
|
||||
type=str,
|
||||
default="gpt-sovits",
|
||||
description="TTS引擎选择",
|
||||
choices=["gpt-sovits", "qwen-omni"]
|
||||
)
|
||||
},
|
||||
"qwen_omni": {
|
||||
"api_key": ConfigField(type=str, default="", description="Qwen Omni API密钥", required=True),
|
||||
"base_url": ConfigField(
|
||||
type=str,
|
||||
default="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
description="Qwen Omni API基础URL"
|
||||
),
|
||||
"model_name": ConfigField(type=str, default="qwen-omni-turbo", description="Qwen Omni模型名称"),
|
||||
"voice_character": ConfigField(type=str, default="Chelsie", description="语音角色"),
|
||||
"media_format": ConfigField(type=str, default="wav", description="音频格式")
|
||||
},
|
||||
"tts_advanced": {
|
||||
"top_k": ConfigField(type=int, default=5, description="Top-K采样参数"),
|
||||
"top_p": ConfigField(type=float, default=1.0, description="Top-P采样参数"),
|
||||
"temperature": ConfigField(type=float, default=1.0, description="温度参数"),
|
||||
"batch_size": ConfigField(type=int, default=1, description="批处理大小"),
|
||||
"split_bucket": ConfigField(type=bool, default=True, description="是否启用分桶处理")
|
||||
},
|
||||
"spatial_effects": {
|
||||
"enabled": ConfigField(type=bool, default=False, description="是否启用空间音效"),
|
||||
"reverb_enabled": ConfigField(type=bool, default=True, description="是否启用混响效果"),
|
||||
"room_size": ConfigField(type=float, default=0.15, description="混响房间大小"),
|
||||
"damping": ConfigField(type=float, default=0.5, description="混响阻尼"),
|
||||
"wet_level": ConfigField(type=float, default=0.33, description="湿声比例"),
|
||||
"dry_level": ConfigField(type=float, default=0.4, description="干声比例"),
|
||||
"width": ConfigField(type=float, default=1.0, description="立体声宽度"),
|
||||
"convolution_enabled": ConfigField(type=bool, default=False, description="是否启用卷积混响"),
|
||||
"convolution_mix": ConfigField(type=float, default=0.5, description="卷积混响干湿比")
|
||||
}
|
||||
}
|
||||
|
||||
config_section_descriptions: ClassVar[dict] = {
|
||||
"plugin": "插件基本配置",
|
||||
"components": "组件启用控制",
|
||||
"components": "组件启用控制",
|
||||
"tts": "TTS语音合成基础配置",
|
||||
"tts_advanced": "TTS高级参数配置(语速、采样、批处理等)",
|
||||
"tts_styles": "TTS风格参数配置(每个分组为一种风格)"
|
||||
"qwen_omni": "Qwen Omni大模型TTS配置(需要API Key)",
|
||||
"tts_advanced": "TTS高级参数配置",
|
||||
"spatial_effects": "空间音频效果配置"
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialise the plugin base class and reset the TTS service handle.

    The service itself is created later in ``on_plugin_loaded``; here the
    attribute is only set to None.

    Args:
        *args: Positional arguments forwarded to the base class.
        **kwargs: Keyword arguments forwarded to the base class.

    Raises:
        Exception: Any error from base-class initialisation is logged
            with its traceback and re-raised.
    """
    try:
        logger.info("TTSVoicePlugin 初始化开始")
        super().__init__(*args, **kwargs)
        self.tts_service = None
        logger.info("TTSVoicePlugin 初始化完成")
    except Exception as e:
        logger.error(f"TTSVoicePlugin 初始化失败: {e}")
        logger.error(traceback.format_exc())
        raise
|
||||
|
||||
async def on_plugin_loaded(self):
    """Create and register the TTS service for the configured engine.

    Reads ``tts.engine`` and builds either the GPT-SoVITS ``TTSService``
    or the ``QwenOmniTTSModel``. A service is registered under the name
    ``"tts"`` only when it was actually created; for an unknown engine or
    a missing / placeholder Qwen Omni API key the handle stays None and
    nothing is registered. Errors are logged and not re-raised.
    """
    try:
        logger.info("开始初始化 TTSVoicePlugin...")

        # Which engine should serve TTS requests.
        engine = self.get_config("tts.engine", "gpt-sovits")
        logger.info(f"当前TTS引擎: {engine}")

        if engine == "gpt-sovits":
            logger.info("初始化 GPT-SoVITS 服务...")
            # NOTE(review): TTSService reads "tts_styles" through this getter,
            # but config_schema declares no such section. Confirm that
            # get_config can return it.
            self.tts_service = TTSService(self.get_config)
            register_service("tts", self.tts_service)
            logger.info("GPT-SoVITS TTSService 已成功初始化并注册。")

        elif engine == "qwen-omni":
            # Qwen Omni needs a real API key; the shipped placeholder is rejected.
            api_key = self.get_config("qwen_omni.api_key", "")
            if not api_key or api_key == "your-api-key-here":
                logger.error("Qwen Omni 需要配置有效的 API Key,请在插件配置中设置 qwen_omni.api_key")
                # Leave the handle empty; no service is registered.
                self.tts_service = None
            else:
                logger.info("初始化 Qwen Omni 服务...")
                self.tts_service = QwenOmniTTSModel(self.get_config)
                register_service("tts", self.tts_service)
                logger.info("Qwen Omni TTSModel 已成功初始化并注册。")

        else:
            logger.error(f"不支持的 TTS 引擎: {engine}")
            self.tts_service = None

        logger.info("TTSVoicePlugin 初始化完成")

    except Exception as e:
        logger.error(f"TTSVoicePlugin 初始化过程中发生错误: {e}")
        logger.error(traceback.format_exc())
        # Deliberately not re-raised so a TTS failure cannot affect the host program.
|
||||
|
||||
def get_plugin_components(self) -> list[tuple[ComponentInfo, type]]:
    """Return the components this plugin contributes.

    The TTS action and the ``/tts`` command are included when
    ``components.action_enabled`` / ``components.command_enabled`` are
    truthy (both default to True).

    Returns:
        A list of ``(component info, component class)`` pairs, or an empty
        list when collecting them raised (the error is logged).
    """
    try:
        components = []
        if self.get_config("components.action_enabled", True):
            components.append((TTSVoiceAction.get_action_info(), TTSVoiceAction))
        if self.get_config("components.command_enabled", True):
            components.append((TTSVoiceCommand.get_plus_command_info(), TTSVoiceCommand))
        logger.info(f"加载了 {len(components)} 个组件")
        return components
    except Exception as e:
        logger.error(f"获取插件组件失败: {e}")
        logger.error(traceback.format_exc())
        return []
|
||||
@@ -1,11 +1,12 @@
|
||||
"""
|
||||
TTS 核心服务
|
||||
TTS 核心服务 - GPT-SoVITS 专用
|
||||
"""
|
||||
import asyncio
|
||||
import base64
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
@@ -20,7 +21,7 @@ logger = get_logger("tts_voice_plugin.service")
|
||||
|
||||
|
||||
class TTSService:
|
||||
"""封装了TTS合成的核心逻辑"""
|
||||
"""GPT-SoVITS TTS 服务"""
|
||||
|
||||
def __init__(self, get_config_func: Callable[[str, Any], Any]):
|
||||
self.get_config = get_config_func
|
||||
@@ -37,133 +38,133 @@ class TTSService:
|
||||
self.tts_styles = self._load_tts_styles()
|
||||
|
||||
if self.tts_styles:
|
||||
logger.info(f"TTS服务已成功加载风格: {list(self.tts_styles.keys())}")
|
||||
logger.info(f"GPT-SoVITS服务已成功加载风格: {list(self.tts_styles.keys())}")
|
||||
else:
|
||||
logger.warning("TTS风格配置为空,请检查配置文件")
|
||||
except Exception as e:
|
||||
logger.error(f"TTS服务配置加载失败: {e}")
|
||||
logger.error(f"GPT-SoVITS服务配置加载失败: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
def _load_tts_styles(self) -> dict[str, dict[str, Any]]:
    """
    Build the style table from the ``tts_styles`` config list.

    The entry whose ``style_name`` is ``"default"`` is mandatory. Its
    ``refer_wav_path``, ``prompt_text``, ``gpt_weights`` and
    ``sovits_weights`` are used as fallbacks for any style that does not
    define those keys. Every style uses the global ``tts.server`` URL.

    Returns:
        A mapping of style name to its resolved settings. Empty when the
        config is not a list, when no ``default`` style exists, or when an
        unexpected error occurs (the error is logged).
    """
    try:
        styles: dict[str, dict[str, Any]] = {}
        global_server = self.get_config("tts.server", "http://127.0.0.1:9880")
        tts_styles_config = self.get_config("tts_styles", [])

        if not isinstance(tts_styles_config, list):
            logger.error(f"tts_styles 配置不是一个列表, 而是 {type(tts_styles_config)}")
            return styles

        # Only dict entries are usable. Filtering up front keeps one malformed
        # entry from raising in the default lookup and discarding every style.
        style_entries = [entry for entry in tts_styles_config if isinstance(entry, dict)]

        default_cfg = next((s for s in style_entries if s.get("style_name") == "default"), None)
        if not default_cfg:
            logger.error("在 tts_styles 配置中未找到 'default' 风格,这是必需的。")
            return styles

        default_refer_wav = default_cfg.get("refer_wav_path", "")
        default_prompt_text = default_cfg.get("prompt_text", "")
        default_gpt_weights = default_cfg.get("gpt_weights", "")
        default_sovits_weights = default_cfg.get("sovits_weights", "")

        if not default_refer_wav:
            logger.warning("TTS 'default' style is missing 'refer_wav_path'.")

        for style_cfg in style_entries:
            style_name = style_cfg.get("style_name")
            if not style_name:
                # Entries without a name cannot be addressed; skip them.
                continue

            styles[style_name] = {
                "url": global_server,
                "name": style_cfg.get("name", style_name),
                "refer_wav_path": style_cfg.get("refer_wav_path", default_refer_wav),
                "prompt_text": style_cfg.get("prompt_text", default_prompt_text),
                "prompt_language": style_cfg.get("prompt_language", "zh"),
                "gpt_weights": style_cfg.get("gpt_weights", default_gpt_weights),
                "sovits_weights": style_cfg.get("sovits_weights", default_sovits_weights),
                "speed_factor": style_cfg.get("speed_factor"),
                # Language policy for the text to synthesize (e.g. "auto").
                "text_language": style_cfg.get("text_language", "auto"),
            }
        return styles
    except Exception as e:
        logger.error(f"加载TTS风格配置失败: {e}")
        logger.error(traceback.format_exc())
        return {}
|
||||
|
||||
def _determine_final_language(self, text: str, mode: str) -> str:
    """
    Decide the language code sent to the API from the policy and the text.

    Args:
        text: The text that will be synthesized.
        mode: The configured language policy. Any value other than
            ``"auto"`` / ``"auto_yue"`` is treated as a concrete language
            code and returned unchanged.

    Returns:
        ``mode`` itself for concrete codes; otherwise ``"yue"``, ``"ja"`` or
        ``"zh"`` chosen by the heuristics below. ``"zh"`` is also returned
        if detection raises.
    """
    try:
        if mode not in ["auto", "auto_yue"]:
            return mode

        # Cantonese is only checked under the auto_yue policy, and it takes
        # priority over the Japanese check.
        if mode == "auto_yue":
            cantonese_keywords = ["嘅", "喺", "咗", "唔", "係", "啲", "咩", "乜", "喂"]
            if any(keyword in text for keyword in cantonese_keywords):
                logger.info("在 auto_yue 模式下检测到粤语关键词,最终语言: yue")
                return "yue"

        # Simple heuristic for Japanese: more than five kana characters, and
        # kana outnumbering half of the CJK ideographs in the text.
        japanese_chars = len(re.findall(r"[\u3040-\u309f\u30a0-\u30ff]", text))
        if japanese_chars > 5 and japanese_chars > len(re.findall(r"[\u4e00-\u9fff]", text)) * 0.5:
            logger.info("检测到日语字符,最终语言: ja")
            return "ja"

        # Nothing specific detected: fall back to Chinese.
        logger.info(f"在 {mode} 模式下未检测到特定语言,默认回退到: zh")
        return "zh"
    except Exception as e:
        logger.error(f"语言检测失败: {e}")
        return "zh"
|
||||
|
||||
def _clean_text_for_tts(self, text: str) -> str:
    """
    Clean chat-style text so it is suitable for speech synthesis.

    Steps: drop bracketed asides, normalize tilde runs and ellipses,
    collapse repeated punctuation, expand a few chat abbreviations, strip
    characters outside the allowed set, make sure the text ends with
    punctuation, and truncate to ``self.max_text_length`` at the nearest
    sentence or clause boundary.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text (possibly empty). If cleaning raises, the error is
        logged and the text as processed so far is returned.
    """
    try:
        # 1. Basic cleanup: remove anything inside (), (), [] or 【】.
        text = re.sub(r"[\((\[【].*?[\))\]】]", "", text)
        # Tilde runs and ellipses must be rewritten BEFORE repeated
        # punctuation is collapsed; otherwise "~~" and "..." are already
        # reduced to a single character and these two rules never match.
        text = re.sub(r"~{2,}|~{2,}", ",", text)
        text = re.sub(r"\.{3,}|…{1,}", "。", text)
        text = re.sub(r"([,。!?、;:,.!?;:~\-`])\1+", r"\1", text)

        # 2. Expand common chat abbreviations into speakable words.
        replacements = {"www": "哈哈哈", "hhh": "哈哈", "233": "哈哈", "666": "厉害", "88": "拜拜"}
        for old, new in replacements.items():
            text = text.replace(old, new)

        # 3. Keep only CJK ideographs, kana, ASCII letters/digits, whitespace
        #    and the listed punctuation.
        text = re.sub(r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,。!?、;:,.!?;:~~]", "", text)

        # Trim first so whitespace-only input stays empty instead of turning
        # into a lone "。", and trailing spaces do not hide existing punctuation.
        text = text.strip()

        # 4. Make sure the text ends with punctuation.
        if text and not text.endswith(tuple(",。!?、;:,.!?;:")):
            text += "。"

        # 5. Truncate at the last sentence end, else the last clause break,
        #    else hard-cut at the length limit.
        if len(text) > self.max_text_length:
            cut_text = text[:self.max_text_length]
            punctuation = "。!?.…"
            last_punc_pos = max(cut_text.rfind(p) for p in punctuation)

            if last_punc_pos != -1:
                text = cut_text[:last_punc_pos + 1]
            else:
                last_comma_pos = max(cut_text.rfind(p) for p in ",、;,;")
                if last_comma_pos != -1:
                    text = cut_text[:last_comma_pos + 1]
                else:
                    text = cut_text

        return text.strip()
    except Exception as e:
        logger.error(f"文本清理失败: {e}")
        return text
|
||||
|
||||
async def _call_gpt_sovits_api(self, server_config: dict, text: str, text_language: str, **kwargs) -> bytes | None:
|
||||
"""调用 GPT-SoVITS API"""
|
||||
try:
|
||||
ref_wav_path = kwargs.get("refer_wav_path")
|
||||
if not ref_wav_path:
|
||||
logger.error(f"API 调用失败:缺少 refer_wav_path。当前风格配置: {server_config}")
|
||||
return None
|
||||
|
||||
base_url = server_config["url"].rstrip("/")
|
||||
|
||||
# --- 步骤一:像稳定版一样,先切换模型 ---
|
||||
async def switch_model_weights(weights_path: str | None, weight_type: str):
|
||||
if not weights_path:
|
||||
|
||||
return
|
||||
api_endpoint = f"/set_{weight_type}_weights"
|
||||
switch_url = f"{base_url}{api_endpoint}"
|
||||
@@ -181,30 +182,23 @@ class TTSService:
|
||||
await switch_model_weights(kwargs.get("gpt_weights"), "gpt")
|
||||
await switch_model_weights(kwargs.get("sovits_weights"), "sovits")
|
||||
|
||||
# --- 步骤二:构建纯净的、不含Base64的请求数据 ---
|
||||
data = {
|
||||
"text": text,
|
||||
"text_lang": text_language,
|
||||
"ref_audio_path": ref_wav_path,
|
||||
"prompt_text": kwargs.get("prompt_text", ""),
|
||||
"prompt_lang": kwargs.get("prompt_language", "zh"),
|
||||
# 在稳定版中,这两个参数是通过API切换的,而不是直接放在请求体里
|
||||
# "gpt_model_path": kwargs.get("gpt_weights"),
|
||||
# "sovits_model_path": kwargs.get("sovits_weights"),
|
||||
}
|
||||
|
||||
# 合并高级配置
|
||||
advanced_config = self.get_config("tts_advanced", {})
|
||||
if isinstance(advanced_config, dict):
|
||||
data.update({k: v for k, v in advanced_config.items() if v is not None})
|
||||
|
||||
# 优先使用风格特定的语速
|
||||
if server_config.get("speed_factor") is not None:
|
||||
data["speed_factor"] = server_config["speed_factor"]
|
||||
|
||||
# --- 步骤三:发送最终的合成请求 ---
|
||||
tts_url = base_url if base_url.endswith("/tts") else f"{base_url}/tts"
|
||||
logger.info(f"发送到 TTS API 的数据: {data}")
|
||||
logger.info(f"发送到 GPT-SoVITS API 的数据: {data}")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(tts_url, json=data, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
|
||||
@@ -212,13 +206,14 @@ class TTSService:
|
||||
return await response.read()
|
||||
else:
|
||||
error_info = await response.text()
|
||||
logger.error(f"TTS API调用失败: {response.status} - {error_info}")
|
||||
logger.error(f"GPT-SoVITS API调用失败: {response.status} - {error_info}")
|
||||
return None
|
||||
except asyncio.TimeoutError:
|
||||
logger.error("TTS服务请求超时")
|
||||
logger.error("GPT-SoVITS服务请求超时")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"TTS API调用异常: {e}")
|
||||
logger.error(f"GPT-SoVITS API调用异常: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
return None
|
||||
|
||||
async def _apply_spatial_audio_effect(self, audio_data: bytes) -> bytes | None:
|
||||
@@ -226,19 +221,14 @@ class TTSService:
|
||||
try:
|
||||
effects_config = self.get_config("spatial_effects", {})
|
||||
if not effects_config.get("enabled", False):
|
||||
|
||||
return audio_data
|
||||
|
||||
# 获取插件目录和IR文件路径
|
||||
# 基于 __file__ 构建稳健的、独立于当前工作目录的路径
|
||||
plugin_file = os.path.abspath(__file__)
|
||||
# services -> tts_voice_plugin -> plugins -> Bot
|
||||
bot_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(plugin_file))))
|
||||
ir_path = os.path.join(bot_root, "assets", "small_room_ir.wav")
|
||||
|
||||
effects = []
|
||||
|
||||
# 根据配置添加Reverb效果
|
||||
if effects_config.get("reverb_enabled", False):
|
||||
effects.append(Reverb(
|
||||
room_size=effects_config.get("room_size", 0.15),
|
||||
@@ -248,7 +238,6 @@ class TTSService:
|
||||
width=effects_config.get("width", 1.0)
|
||||
))
|
||||
|
||||
# 根据配置添加Convolution效果
|
||||
if effects_config.get("convolution_enabled", False) and os.path.exists(ir_path):
|
||||
effects.append(Convolution(
|
||||
impulse_response_filename=ir_path,
|
||||
@@ -258,19 +247,14 @@ class TTSService:
|
||||
logger.warning(f"卷积混响已启用,但IR文件不存在 ({ir_path}),跳过该效果。")
|
||||
|
||||
if not effects:
|
||||
|
||||
|
||||
return audio_data
|
||||
|
||||
# 将原始音频数据加载到内存中的 AudioFile 对象
|
||||
with io.BytesIO(audio_data) as audio_stream:
|
||||
with AudioFile(audio_stream, "r") as f:
|
||||
board = Pedalboard(effects)
|
||||
effected = board(f.read(f.frames), f.samplerate)
|
||||
|
||||
# 将处理后的音频数据写回内存中的字节流
|
||||
with io.BytesIO() as output_stream:
|
||||
# 使用 soundfile 写入,因为它更稳定
|
||||
sf.write(output_stream, effected.T, f.samplerate, format="WAV")
|
||||
processed_audio_data = output_stream.getvalue()
|
||||
|
||||
@@ -279,66 +263,69 @@ class TTSService:
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"应用空间效果时出错: {e}")
|
||||
return audio_data # 如果出错,返回原始音频
|
||||
logger.error(traceback.format_exc())
|
||||
return audio_data
|
||||
|
||||
async def generate_voice(self, text: str, style_hint: str = "default", language_hint: str | None = None) -> str | None:
    """
    Synthesize speech for ``text`` through GPT-SoVITS.

    Args:
        text: Raw text to speak; it is cleaned before synthesis.
        style_hint: Name of the requested style. If it is not configured,
            ``"default"`` is used, and failing that the first configured
            style; each fallback is logged as a warning.
        language_hint: Language code chosen upstream. When given it
            overrides the style's ``text_language`` policy.

    Returns:
        The synthesized audio as a base64-encoded string, or ``None`` when
        no styles are configured, the cleaned text is empty, the API call
        yields no audio, or an unexpected error occurs (logged).
    """
    try:
        # Reload configuration on every call before using it.
        self._load_config()

        if not self.tts_styles:
            logger.error("TTS风格配置为空,无法生成语音。")
            return None

        # Resolve the style: requested -> "default" -> first available.
        if style_hint in self.tts_styles:
            style = style_hint
        elif "default" in self.tts_styles:
            style = "default"
            logger.warning(f"指定风格 '{style_hint}' 不存在,自动回退到: 'default'")
        else:
            style = next(iter(self.tts_styles))
            logger.warning(f"指定风格 '{style_hint}' 和 'default' 均不存在,自动回退到第一个可用风格: {style}")

        server_config = self.tts_styles[style]
        clean_text = self._clean_text_for_tts(text)
        if not clean_text:
            return None

        # Language decision: an explicit hint has the highest priority,
        # otherwise the style's policy is applied to the cleaned text.
        if language_hint:
            final_language = language_hint
            logger.info(f"使用决策模型指定的语言: {final_language}")
        else:
            language_policy = server_config.get("text_language", "auto")
            final_language = self._determine_final_language(clean_text, language_policy)
            logger.info(f"决策模型未指定语言,使用策略 '{language_policy}' -> 最终语言: {final_language}")

        logger.info(f"开始GPT-SoVITS语音合成,文本:{clean_text[:50]}..., 风格:{style}, 最终语言: {final_language}")

        audio_data = await self._call_gpt_sovits_api(
            server_config=server_config, text=clean_text, text_language=final_language,
            refer_wav_path=server_config.get("refer_wav_path"),
            prompt_text=server_config.get("prompt_text"),
            prompt_language=server_config.get("prompt_language"),
            gpt_weights=server_config.get("gpt_weights"),
            sovits_weights=server_config.get("sovits_weights"),
        )

        if not audio_data:
            return None

        # Optional post-processing; on failure the unprocessed audio is kept.
        spatial_config = self.get_config("spatial_effects", {})
        if spatial_config.get("enabled", False):
            logger.info("检测到已启用空间音频效果,开始处理...")
            processed_audio = await self._apply_spatial_audio_effect(audio_data)
            if processed_audio:
                logger.info("空间音频效果应用成功!")
                audio_data = processed_audio
            else:
                logger.warning("空间音频效果应用失败,将使用原始音频。")

        return base64.b64encode(audio_data).decode("utf-8")
    except Exception as e:
        logger.error(f"语音合成失败: {e}")
        logger.error(traceback.format_exc())
        return None
|
||||
Reference in New Issue
Block a user