feat:为tts增加Qwen-Omni接口

This commit is contained in:
mcn1630
2025-11-28 17:24:05 +08:00
parent e65beb2767
commit 2a4f73859d
5 changed files with 648 additions and 419 deletions

View File

@@ -2,17 +2,17 @@ from src.plugin_system.base.plugin_metadata import PluginMetadata
__plugin_meta__ = PluginMetadata(
name="GPT-SoVITS 语音合成插件",
description="基于 GPT-SoVITS 的文本转语音插件,支持多种语言和多风格语音合成。",
description="基于 GPT-SoVITS 和 Qwen Omni 的文本转语音插件,支持多种语言和多风格语音合成。",
usage=" ",
version="2.0.0",
author="靓仔",
version="3.2.0",
author="靓仔 & AI助手",
license="AGPL-v3.0",
repository_url="https://github.com/xuqian13/tts_voice_plugin",
keywords=["tts", "语音合成", "文本转语音", "gpt-sovits", "语音", "朗读", "多风格", "语音播报"],
keywords=["tts", "语音合成", "文本转语音", "gpt-sovits", "qwen-omni", "语音", "朗读", "多风格", "语音播报"],
categories=["Utility", "Communication", "Accessibility"],
extra={
"is_built_in": False,
"plugin_type": "tools",
},
python_dependencies = ["aiohttp", "soundfile", "pedalboard"]
)
python_dependencies = ["aiohttp", "soundfile", "pedalboard", "openai", "toml", "numpy"]
)

View File

@@ -4,6 +4,7 @@ TTS 语音合成 Action
from pathlib import Path
from typing import ClassVar
import traceback
import toml
@@ -16,38 +17,134 @@ from ..services.manager import get_service
logger = get_logger("tts_voice_plugin.action")
def _create_default_config(config_file: Path) -> bool:
"""创建默认配置文件"""
try:
# 确保配置目录存在
config_file.parent.mkdir(parents=True, exist_ok=True)
default_config = {
"plugin": {
"enabled": True,
"debug": False
},
"components": {
"action_enabled": True,
"command_enabled": True
},
"tts": {
"engine": "qwen-omni",
"server": "http://127.0.0.1:9880",
"timeout": 60,
"max_text_length": 500
},
"qwen_omni": {
"api_key": "your-api-key-here",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model_name": "qwen-omni-turbo",
"voice_character": "Chelsie",
"media_format": "wav"
},
"tts_advanced": {
"top_k": 5,
"top_p": 1.0,
"temperature": 1.0,
"batch_size": 1,
"split_bucket": True
},
"spatial_effects": {
"enabled": False,
"reverb_enabled": True,
"room_size": 0.15,
"damping": 0.5,
"wet_level": 0.33,
"dry_level": 0.4,
"width": 1.0,
"convolution_enabled": False,
"convolution_mix": 0.5
},
"tts_styles": [
{
"style_name": "default",
"name": "默认风格",
"refer_wav_path": "/path/to/your/reference.wav",
"prompt_text": "这是一个示例文本,请替换为您自己的参考音频文本。",
"prompt_language": "zh",
"gpt_weights": "/path/to/your/gpt_weights.pth",
"sovits_weights": "/path/to/your/sovits_weights.pth",
"speed_factor": 1.0,
"text_language": "auto"
}
]
}
with open(config_file, 'w', encoding='utf-8') as f:
toml.dump(default_config, f)
logger.info(f"已创建默认配置文件: {config_file}")
return True
except Exception as e:
logger.error(f"创建默认配置文件失败: {e}", exc_info=True)
return False
def _get_available_styles() -> list[str]:
"""动态读取配置文件获取所有可用的TTS风格名称"""
try:
# 这个路径构建逻辑是为了确保无论从哪里启动,都能准确定位到配置文件
# 使用更稳健的路径构建方法
plugin_file = Path(__file__).resolve()
# Bot/src/plugins/built_in/tts_voice_plugin/actions -> Bot
bot_root = plugin_file.parent.parent.parent.parent.parent.parent
config_file = bot_root / "config" / "plugins" / "tts_voice_plugin" / "config.toml"
if not config_file.is_file():
logger.warning("在 tts_action 中未找到 tts_voice_plugin 的配置文件,无法动态加载风格列表。")
# 计算插件根目录: Bot/src/plugins/built_in/tts_voice_plugin/actions -> Bot/src/plugins/built_in/tts_voice_plugin
plugin_root = plugin_file.parent.parent
# 尝试多种可能的配置路径
possible_paths = [
# 标准路径: Bot/config/plugins/tts_voice_plugin/config.toml
plugin_root.parent.parent.parent.parent / "config" / "plugins" / "tts_voice_plugin" / "config.toml",
# 备用路径: Bot/config/plugins/tts_voice_plugin/config.toml
plugin_root.parent.parent.parent / "config" / "plugins" / "tts_voice_plugin" / "config.toml",
# 开发路径: 直接在插件目录下的 config.toml
plugin_root / "config.toml"
]
config_file = None
for path in possible_paths:
if path.is_file():
config_file = path
break
if not config_file or not config_file.is_file():
logger.warning("配置文件不存在,使用默认风格列表")
return ["default"]
config = toml.loads(config_file.read_text(encoding="utf-8"))
styles_config = config.get("tts_styles", [])
if not isinstance(styles_config, list):
# 检查当前使用的 TTS 引擎
engine = config.get("tts", {}).get("engine", "gpt-sovits")
if engine == "qwen-omni":
# Qwen Omni 使用默认风格
return ["default"]
else:
# GPT-SoVITS 从配置中读取风格
styles_config = config.get("tts_styles", [])
if not isinstance(styles_config, list):
logger.warning(f"tts_styles 配置不是列表类型: {type(styles_config)}")
return ["default"]
# 使用显式循环和类型检查来提取 style_name,以确保 Pylance 类型检查通过
style_names: list[str] = []
for style in styles_config:
if isinstance(style, dict):
name = style.get("style_name")
# 确保 name 是一个非空字符串
if isinstance(name, str) and name:
style_names.append(name)
# 使用显式循环和类型检查来提取 style_name
style_names: list[str] = []
for style in styles_config:
if isinstance(style, dict):
name = style.get("style_name")
# 确保 name 是一个非空字符串
if isinstance(name, str) and name:
style_names.append(name)
return style_names if style_names else ["default"]
return style_names if style_names else ["default"]
except Exception as e:
logger.error(f"动态加载TTS风格列表时出错: {e}")
logger.error(f"动态加载TTS风格列表时出错: {e}", exc_info=True)
return ["default"] # 出现任何错误都回退
@@ -68,7 +165,7 @@ class TTSVoiceAction(BaseAction):
parallel_action = False
action_parameters: ClassVar[dict] = {
"tts_voice_text": {
"text": {
"type": "string",
"description": "需要转换为语音并发送的完整、自然、适合口语的文本内容。",
"required": True
@@ -101,7 +198,7 @@ class TTSVoiceAction(BaseAction):
action_require: ClassVar[list] = [
"在调用此动作时,你必须在 'text' 参数中提供要合成语音的完整回复内容。这是强制性的。",
"当用户明确请求使用语音进行回复时,例如发个语音听听’、‘用语音说’等。",
"当用户明确请求使用语音进行回复时,例如'发个语音听听''用语音说'等。",
"当对话内容适合用语音表达,例如讲故事、念诗、撒嬌或进行角色扮演时。",
"在表达特殊情感(如安慰、鼓励、庆祝)的场景下,可以主动使用语音来增强感染力。",
"不要在日常的、简短的问答或闲聊中频繁使用语音,避免打扰用户。",
@@ -119,34 +216,35 @@ class TTSVoiceAction(BaseAction):
"""
判断此 Action 是否应该被激活。
满足以下任一条件即可激活:
1. 55% 的随机概率
1. 25% 的随机概率
2. 匹配到预设的关键词
3. LLM 判断当前场景适合发送语音
"""
# 条件1: 随机激活
if await self._random_activation(0.25):
logger.info(f"{self.log_prefix} 随机激活成功 (25%)")
return True
try:
# 条件1: 随机激活
if await self._random_activation(0.25):
logger.info(f"{self.log_prefix} 随机激活成功 (25%)")
return True
# 条件2: 关键词激活
keywords = [
"发语音", "语音", "说句话", "用语音说", "听你", "听声音", "想你", "想听声音",
"讲个话", "说段话", "念一下", "读一下", "用嘴说", "", "能发语音吗", "亲口"
]
if await self._keyword_match(keywords):
logger.info(f"{self.log_prefix} 关键词激活成功")
return True
# 条件2: 关键词激活
keywords = [
"发语音", "语音", "说句话", "用语音说", "听你", "听声音", "想你", "想听声音",
"讲个话", "说段话", "念一下", "读一下", "用嘴说", "", "能发语音吗", "亲口"
]
if await self._keyword_match(keywords):
logger.info(f"{self.log_prefix} 关键词激活成功")
return True
# 条件3: LLM 判断激活
# 注意:这里我们复用 action_require 里的描述,让 LLM 的判断更精准
if await self._llm_judge_activation(
llm_judge_model=llm_judge_model
):
logger.info(f"{self.log_prefix} LLM 判断激活成功")
return True
# 条件3: LLM 判断激活
if await self._llm_judge_activation(llm_judge_model=llm_judge_model):
logger.info(f"{self.log_prefix} LLM 判断激活成功")
return True
logger.debug(f"{self.log_prefix} 所有激活条件均未满足,不激活")
return False
logger.debug(f"{self.log_prefix} 所有激活条件均未满足,不激活")
return False
except Exception as e:
logger.error(f"{self.log_prefix} 激活判断失败: {e}")
return False
async def execute(self) -> tuple[bool, str]:
"""
@@ -157,36 +255,38 @@ class TTSVoiceAction(BaseAction):
logger.error(f"{self.log_prefix} TTSService 未注册或初始化失败,静默处理。")
return False, "TTSService 未注册或初始化失败"
initial_text = self.action_data.get("tts_voice_text", "").strip()
# 尝试多种可能的参数名
initial_text = self.action_data.get("text", "").strip()
if not initial_text:
initial_text = self.action_data.get("tts_voice_text", "").strip()
voice_style = self.action_data.get("voice_style", "default")
# 新增:从决策模型获取指定的语言模式
text_language = self.action_data.get("text_language") # 如果模型没给,就是 None
logger.info(f"{self.log_prefix} 接收到规划器初步文本: '{initial_text[:70]}...', 指定风格: {voice_style}, 指定语言: {text_language}")
text_language = self.action_data.get("text_language")
logger.info(f"{self.log_prefix} 接收到规划器文本: '{initial_text[:70]}...', 风格: {voice_style}, 语言: {text_language}")
# 1. 使用规划器提供的文本
text = initial_text
if not text:
if not initial_text:
logger.warning(f"{self.log_prefix} 规划器提供的文本为空,静默处理。")
return False, "规划器提供的文本为空"
# 2. 调用 TTSService 生成语音
logger.info(f"{self.log_prefix} 使用最终文本进行语音合成: '{text[:70]}...'")
# 调用 TTSService 生成语音
logger.info(f"{self.log_prefix} 使用最终文本进行语音合成: '{initial_text[:70]}...'")
audio_b64 = await self.tts_service.generate_voice(
text=text,
text=initial_text,
style_hint=voice_style,
language_hint=text_language # 新增:将决策模型指定的语言传递给服务
language_hint=text_language
)
if audio_b64:
# 在发送语音前,将文本注册到缓存中
register_self_voice(audio_b64, text)
register_self_voice(audio_b64, initial_text)
await self.send_custom(message_type="voice", content=audio_b64)
logger.info(f"{self.log_prefix} GPT-SoVITS语音发送成功")
logger.info(f"{self.log_prefix} 语音发送成功")
await self.store_action_info(
action_prompt_display=f"将文本转换为语音并发送 (风格:{voice_style})",
action_done=True
)
return True, f"成功生成并发送语音,文本长度: {len(text)}字符"
return True, f"成功生成并发送语音,文本长度: {len(initial_text)}字符"
else:
logger.error(f"{self.log_prefix} TTS服务未能返回音频数据静默处理。")
await self.store_action_info(
@@ -197,9 +297,9 @@ class TTSVoiceAction(BaseAction):
except Exception as e:
logger.error(f"{self.log_prefix} 语音合成过程中发生未知错误: {e!s}")
logger.error(traceback.format_exc())
await self.store_action_info(
action_prompt_display=f"语音合成失败: {e!s}",
action_done=False
)
return False, f"语音合成出错: {e!s}"
return False, f"语音合成出错: {e!s}"

View File

@@ -19,7 +19,7 @@ class TTSVoiceCommand(PlusCommand):
"""
command_name: str = "tts"
command_description: str = "使用GPT-SoVITS将文本转换为语音并发送"
command_description: str = "使用GPT-SoVITS或Qwen Omni将文本转换为语音并发送"
command_aliases: ClassVar[list[str]] = ["语音合成", ""]
command_usage = "/tts <要说的文本> [风格]"
@@ -41,8 +41,14 @@ class TTSVoiceCommand(PlusCommand):
if not tts_service:
raise RuntimeError("TTSService 未注册或初始化失败")
# 获取可用风格列表
available_styles = tts_service.tts_styles.keys()
# 获取可用风格列表 - 兼容不同的 TTS 服务类型
available_styles = []
if hasattr(tts_service, 'tts_styles'):
# GPT-SoVITS 服务
available_styles = list(tts_service.tts_styles.keys())
else:
# Qwen Omni 服务 - 使用默认风格
available_styles = ["default"]
text_to_speak = ""
style_hint = "default"
@@ -61,7 +67,11 @@ class TTSVoiceCommand(PlusCommand):
await self.send_text("请提供要转换为语音的文本内容哦!")
return False, "文本内容为空", True
audio_b64 = await tts_service.generate_voice(text_to_speak, style_hint)
# 调用 TTS 服务生成语音
audio_b64 = await tts_service.generate_voice(
text=text_to_speak,
style_hint=style_hint
)
if audio_b64:
await self.send_type(message_type="voice", content=audio_b64)
@@ -73,4 +83,4 @@ class TTSVoiceCommand(PlusCommand):
except Exception as e:
logger.error(f"执行 /tts 命令时出错: {e}")
await self.send_text("❌ 语音合成时发生了意想不到的错误,请查看日志。")
return False, "命令执行异常", True
return False, "命令执行异常", True

View File

@@ -1,14 +1,22 @@
"""
TTS Voice 插件 - 重构版
"""
import base64
import io
import traceback
from dataclasses import dataclass
from pathlib import Path
from typing import Any, ClassVar
from typing import Any, ClassVar, Dict, AsyncIterator
import toml
import numpy as np
import soundfile as sf
from openai import OpenAI
from src.common.logger import get_logger
from src.plugin_system import BasePlugin, ComponentInfo, register_plugin
from src.plugin_system.base.component_types import PermissionNodeField
from src.plugin_system.base.config_types import ConfigField
from .actions.tts_action import TTSVoiceAction
from .commands.tts_command import TTSVoiceCommand
@@ -17,18 +25,191 @@ from .services.tts_service import TTSService
logger = get_logger("tts_voice_plugin")
@dataclass
class QwenOmniConfig:
"""Qwen Omni TTS 配置"""
api_key: str
model_name: str = "qwen-omni-turbo"
voice_character: str = "Chelsie"
media_format: str = "wav"
base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "QwenOmniConfig":
return cls(
api_key=data.get("api_key", ""),
model_name=data.get("model_name", "qwen-omni-turbo"),
voice_character=data.get("voice_character", "Chelsie"),
media_format=data.get("media_format", "wav"),
base_url=data.get("base_url", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
)
class QwenOmniTTSModel:
"""Qwen Omni TTS 模型"""
def __init__(self, get_config_func):
"""初始化TTS模型
Args:
get_config_func: 插件配置获取函数
"""
self.get_config = get_config_func
self.config = self._load_config()
def _load_config(self) -> QwenOmniConfig:
"""从插件配置加载Qwen Omni配置"""
try:
config_data = {
"api_key": self.get_config("qwen_omni.api_key", ""),
"model_name": self.get_config("qwen_omni.model_name", "qwen-omni-turbo"),
"voice_character": self.get_config("qwen_omni.voice_character", "Chelsie"),
"media_format": self.get_config("qwen_omni.media_format", "wav"),
"base_url": self.get_config("qwen_omni.base_url", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
}
return QwenOmniConfig.from_dict(config_data)
except Exception as e:
logger.error(f"加载 Qwen Omni 配置失败: {e}")
return QwenOmniConfig(api_key="")
async def tts(self, text: str, **kwargs) -> bytes:
"""文本转语音 - 将PCM数据转换为WAV文件"""
try:
audio_base64_string = ""
chunk_count = 0
async for chunk in self._tts_stream(text, **kwargs):
audio_base64_string += chunk
chunk_count += 1
if not audio_base64_string:
logger.error("没有收到任何音频数据")
return None
# 解码base64得到PCM数据
pcm_data = base64.b64decode(audio_base64_string)
# 将PCM数据转换为WAV文件
wav_bytes = self._pcm_to_wav_soundfile(pcm_data)
return wav_bytes
except Exception as e:
logger.error(f"Qwen Omni TTS 失败: {e}")
logger.error(traceback.format_exc())
return None
def _pcm_to_wav_soundfile(self, pcm_data: bytes, sample_rate: int = 24000, channels: int = 1) -> bytes:
"""使用soundfile将PCM数据转换为WAV文件"""
try:
import io
import numpy as np
# 将PCM字节数据转换为numpy数组
# 假设是16位有符号整数这是最常见的PCM格式
audio_array = np.frombuffer(pcm_data, dtype=np.int16)
# 创建字节流
wav_io = io.BytesIO()
# 使用soundfile写入WAV格式
sf.write(wav_io, audio_array, sample_rate, format='WAV')
# 获取WAV文件数据
wav_bytes = wav_io.getvalue()
wav_io.close()
logger.info(f"使用soundfile转换PCM到WAV: {len(pcm_data)}字节PCM -> {len(wav_bytes)}字节WAV")
return wav_bytes
except Exception as e:
logger.error(f"使用soundfile转换PCM到WAV失败: {e}")
logger.error(traceback.format_exc())
return pcm_data
async def _tts_stream(self, text: str, **kwargs) -> AsyncIterator[str]:
"""使用大模型流式生成音频数据"""
try:
logger.info(f"开始调用Qwen Omni API生成音频文本: {text[:30]}{'...' if len(text) > 30 else ''}")
prompt = f"复述这句话,不要输出其他内容,只输出'{text}'就好,不要输出其他内容,不要输出前后缀,不要输出'{text}'以外的内容,不要说:如果还有类似的需求或者想聊聊别的"
logger.info(f"使用prompt: {prompt}")
client = OpenAI(api_key=self.config.api_key, base_url=self.config.base_url)
completion = client.chat.completions.create(
model=self.config.model_name,
messages=[{"role": "user", "content": prompt}],
modalities=["text", "audio"],
audio={
"voice": self.config.voice_character,
"format": self.config.media_format,
},
stream=True,
stream_options={"include_usage": True},
)
audio_data_received = False
total_audio_length = 0
for chunk in completion:
if hasattr(chunk, "choices") and chunk.choices:
delta = chunk.choices[0].delta
# 检查音频数据
if hasattr(delta, "audio") and delta.audio:
audio_dict = delta.audio
if isinstance(audio_dict, dict) and 'data' in audio_dict and audio_dict['data']:
audio_data = audio_dict['data']
total_audio_length += len(audio_data)
audio_data_received = True
yield audio_data
else:
logger.debug(f"音频字典内容: {audio_dict}")
# 记录文本内容用于调试
if hasattr(delta, "content") and delta.content:
logger.debug(f"收到文本内容: {delta.content}")
if hasattr(chunk, "usage") and chunk.usage:
logger.info(f"本次使用量: {chunk.usage}")
logger.info(f"音频数据接收完成总base64长度: {total_audio_length}")
if not audio_data_received:
logger.warning("API调用成功但没有收到音频数据")
except Exception as e:
logger.error(f"Qwen Omni API调用失败: {e}")
logger.error(traceback.format_exc())
raise
async def generate_voice(self, text: str, style_hint: str = "default", language_hint: str | None = None) -> str | None:
"""生成语音的兼容接口"""
try:
logger.info(f"开始生成语音,文本: {text}")
audio_data = await self.tts(text)
if audio_data:
logger.info(f"语音生成成功,数据长度: {len(audio_data)} 字节")
# 直接返回base64编码的WAV数据
return base64.b64encode(audio_data).decode("utf-8")
else:
logger.error("语音生成失败audio_data 为 None")
return None
except Exception as e:
logger.error(f"Qwen Omni 语音生成失败: {e}")
logger.error(traceback.format_exc())
return None
@register_plugin
class TTSVoicePlugin(BasePlugin):
"""
GPT-SoVITS 语音合成插件 - 重构版
GPT-SoVITS 和 Qwen Omni 语音合成插件
"""
plugin_name = "tts_voice_plugin"
plugin_description = "基于GPT-SoVITS的文本转语音插件(重构版)"
plugin_version = "3.1.2"
plugin_author = "Kilo Code & 靚仔"
enable_plugin = True
plugin_description = "基于GPT-SoVITS和Qwen Omni的文本转语音插件"
plugin_version = "3.2.0"
plugin_author = "Kilo Code & 靓仔 & AI助手"
config_file_name = "config.toml"
dependencies: ClassVar[list[str]] = []
@@ -36,182 +217,133 @@ class TTSVoicePlugin(BasePlugin):
PermissionNodeField(node_name="command.use", description="是否可以使用 /tts 命令"),
]
config_schema: ClassVar[dict] = {}
# 使用 ConfigField 的配置架构
config_schema: ClassVar[dict] = {
"plugin": {
"enabled": ConfigField(type=bool, default=True, description="是否启用插件"),
"debug": ConfigField(type=bool, default=False, description="是否开启调试模式")
},
"components": {
"action_enabled": ConfigField(type=bool, default=True, description="是否启用TTS Action"),
"command_enabled": ConfigField(type=bool, default=True, description="是否启用TTS命令")
},
"tts": {
"server": ConfigField(type=str, default="http://127.0.0.1:9880", description="GPT-SoVITS服务器地址"),
"timeout": ConfigField(type=int, default=60, description="TTS请求超时时间"),
"max_text_length": ConfigField(type=int, default=500, description="最大文本长度"),
"engine": ConfigField(
type=str,
default="gpt-sovits",
description="TTS引擎选择",
choices=["gpt-sovits", "qwen-omni"]
)
},
"qwen_omni": {
"api_key": ConfigField(type=str, default="", description="Qwen Omni API密钥", required=True),
"base_url": ConfigField(
type=str,
default="https://dashscope.aliyuncs.com/compatible-mode/v1",
description="Qwen Omni API基础URL"
),
"model_name": ConfigField(type=str, default="qwen-omni-turbo", description="Qwen Omni模型名称"),
"voice_character": ConfigField(type=str, default="Chelsie", description="语音角色"),
"media_format": ConfigField(type=str, default="wav", description="音频格式")
},
"tts_advanced": {
"top_k": ConfigField(type=int, default=5, description="Top-K采样参数"),
"top_p": ConfigField(type=float, default=1.0, description="Top-P采样参数"),
"temperature": ConfigField(type=float, default=1.0, description="温度参数"),
"batch_size": ConfigField(type=int, default=1, description="批处理大小"),
"split_bucket": ConfigField(type=bool, default=True, description="是否启用分桶处理")
},
"spatial_effects": {
"enabled": ConfigField(type=bool, default=False, description="是否启用空间音效"),
"reverb_enabled": ConfigField(type=bool, default=True, description="是否启用混响效果"),
"room_size": ConfigField(type=float, default=0.15, description="混响房间大小"),
"damping": ConfigField(type=float, default=0.5, description="混响阻尼"),
"wet_level": ConfigField(type=float, default=0.33, description="湿声比例"),
"dry_level": ConfigField(type=float, default=0.4, description="干声比例"),
"width": ConfigField(type=float, default=1.0, description="立体声宽度"),
"convolution_enabled": ConfigField(type=bool, default=False, description="是否启用卷积混响"),
"convolution_mix": ConfigField(type=float, default=0.5, description="卷积混响干湿比")
}
}
config_section_descriptions: ClassVar[dict] = {
"plugin": "插件基本配置",
"components": "组件启用控制",
"components": "组件启用控制",
"tts": "TTS语音合成基础配置",
"tts_advanced": "TTS高级参数配置语速、采样、批处理等",
"tts_styles": "TTS风格参数配置(每个分组为一种风格)"
"qwen_omni": "Qwen Omni大模型TTS配置需要API Key",
"tts_advanced": "TTS高级参数配置",
"spatial_effects": "空间音频效果配置"
}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.tts_service = None
def _create_default_config(self, config_file: Path):
"""
如果配置文件不存在,则创建一个默认的配置文件。
"""
if config_file.is_file():
return
logger.info(f"TTS 配置文件不存在,正在创建默认配置文件于: {config_file}")
default_config_content = """# 插件基础配置
[plugin]
enable = true
keywords = [
"发语音", "语音", "说句话", "用语音说", "听你", "听声音", "想听你", "想听声音",
"讲个话", "说段话", "念一下", "读一下", "用嘴说", "", "能发语音吗","亲口"
]
# 组件启用控制
[components]
action_enabled = true
command_enabled = true
# TTS 语音合成基础配置
[tts]
server = "http://127.0.0.1:9880"
timeout = 180
max_text_length = 1000
# TTS 风格参数配置
# 每个 [[tts_styles]] 代表一个独立的语音风格配置
[[tts_styles]]
# 风格的唯一标识符,必须有一个名为 "default"
style_name = "default"
# 显示名称
name = "默认"
# 参考音频路径
refer_wav_path = "C:/path/to/your/reference.wav"
# 参考音频文本
prompt_text = "这是一个示例文本,请替换为您自己的参考音频文本。"
# 参考音频语言
prompt_language = "zh"
# GPT 模型路径
gpt_weights = "C:/path/to/your/gpt_weights.ckpt"
# SoVITS 模型路径
sovits_weights = "C:/path/to/your/sovits_weights.pth"
# 语速
speed_factor = 1.0
# TTS 高级参数配置
[tts_advanced]
media_type = "wav"
top_k = 9
top_p = 0.8
temperature = 0.8
batch_size = 6
batch_threshold = 0.75
text_split_method = "cut5"
repetition_penalty = 1.4
sample_steps = 150
super_sampling = true
# 空间音效配置
[spatial_effects]
# 是否启用空间音效处理
enabled = false
# 是否启用标准混响效果
reverb_enabled = false
# 混响的房间大小 (建议范围 0.0-1.0)
room_size = 0.2
# 混响的阻尼/高频衰减 (建议范围 0.0-1.0)
damping = 0.6
# 混响的湿声(效果声)比例 (建议范围 0.0-1.0)
wet_level = 0.3
# 混响的干声(原声)比例 (建议范围 0.0-1.0)
dry_level = 0.8
# 混响的立体声宽度 (建议范围 0.0-1.0)
width = 1.0
# 是否启用卷积混响需要assets/small_room_ir.wav文件
convolution_enabled = false
# 卷积混响的干湿比 (建议范围 0.0-1.0)
convolution_mix = 0.7
"""
try:
config_file.parent.mkdir(parents=True, exist_ok=True)
with open(config_file, "w", encoding="utf-8") as f:
f.write(default_config_content.strip())
logger.info("默认 TTS 配置文件创建成功。")
logger.info("TTSVoicePlugin 初始化开始")
super().__init__(*args, **kwargs)
self.tts_service = None
logger.info("TTSVoicePlugin 初始化完成")
except Exception as e:
logger.error(f"创建默认 TTS 配置文件失败: {e}")
def _get_config_wrapper(self, key: str, default: Any = None) -> Any:
"""
配置获取的包装器,用于解决 get_config 无法直接获取动态表(如 tts_styles和未在 schema 中定义的节的问题。
由于插件系统的 schema 为空时不会加载未定义的键,这里手动读取配置文件以获取所需配置。
"""
# 需要手动加载的顶级配置节
manual_load_keys = ["tts_styles", "spatial_effects", "tts_advanced", "tts"]
top_key = key.split(".")[0]
if top_key in manual_load_keys:
try:
plugin_file = Path(__file__).resolve()
bot_root = plugin_file.parent.parent.parent.parent.parent
config_file = bot_root / "config" / "plugins" / self.plugin_name / self.config_file_name
if not config_file.is_file():
logger.error(f"TTS config file not found at robustly constructed path: {config_file}")
return default
full_config = toml.loads(config_file.read_text(encoding="utf-8"))
# 支持点状路径访问
value = full_config
for k in key.split("."):
if isinstance(value, dict):
value = value.get(k)
else:
return default
return value if value is not None else default
except Exception as e:
logger.error(f"Failed to manually load '{key}' from config: {e}")
return default
return self.get_config(key, default)
logger.error(f"TTSVoicePlugin 初始化失败: {e}")
logger.error(traceback.format_exc())
raise
async def on_plugin_loaded(self):
"""
插件加载完成后的回调,初始化并注册服务。
"""
logger.info("初始化 TTSVoicePlugin...")
try:
logger.info("开始初始化 TTSVoicePlugin...")
plugin_file = Path(__file__).resolve()
bot_root = plugin_file.parent.parent.parent.parent.parent
config_file = bot_root / "config" / "plugins" / self.plugin_name / self.config_file_name
self._create_default_config(config_file)
# 获取当前使用的TTS引擎
engine = self.get_config("tts.engine", "gpt-sovits")
logger.info(f"当前TTS引擎: {engine}")
# 实例化 TTSService并传入 get_config 方法
self.tts_service = TTSService(self._get_config_wrapper)
if engine == "gpt-sovits":
# 实例化 GPT-SoVITS 服务
logger.info("初始化 GPT-SoVITS 服务...")
self.tts_service = TTSService(self.get_config)
register_service("tts", self.tts_service)
logger.info("GPT-SoVITS TTSService 已成功初始化并注册。")
elif engine == "qwen-omni":
# 检查API Key
api_key = self.get_config("qwen_omni.api_key", "")
if not api_key or api_key == "your-api-key-here":
logger.error("Qwen Omni 需要配置有效的 API Key请在插件配置中设置 qwen_omni.api_key")
# 创建空服务,避免后续调用出错
self.tts_service = None
else:
# 实例化 Qwen Omni 服务
logger.info("初始化 Qwen Omni 服务...")
self.tts_service = QwenOmniTTSModel(self.get_config)
register_service("tts", self.tts_service)
logger.info("Qwen Omni TTSModel 已成功初始化并注册。")
else:
logger.error(f"不支持的 TTS 引擎: {engine}")
self.tts_service = None
# 注册服务
register_service("tts", self.tts_service)
logger.info("TTSService 已成功初始化并注册。")
logger.info("TTSVoicePlugin 初始化完成")
except Exception as e:
logger.error(f"TTSVoicePlugin 初始化过程中发生错误: {e}")
logger.error(traceback.format_exc())
# 不要重新抛出异常,避免影响主程序
def get_plugin_components(self) -> list[tuple[ComponentInfo, type]]:
"""
返回插件包含的组件列表。
"""
components = []
if self.get_config("components.action_enabled", True):
components.append((TTSVoiceAction.get_action_info(), TTSVoiceAction))
if self.get_config("components.command_enabled", True):
components.append((TTSVoiceCommand.get_plus_command_info(), TTSVoiceCommand))
return components
try:
components = []
if self.get_config("components.action_enabled", True):
components.append((TTSVoiceAction.get_action_info(), TTSVoiceAction))
if self.get_config("components.command_enabled", True):
components.append((TTSVoiceCommand.get_plus_command_info(), TTSVoiceCommand))
logger.info(f"加载了 {len(components)} 个组件")
return components
except Exception as e:
logger.error(f"获取插件组件失败: {e}")
logger.error(traceback.format_exc())
return []

View File

@@ -1,11 +1,12 @@
"""
TTS 核心服务
TTS 核心服务 - GPT-SoVITS 专用
"""
import asyncio
import base64
import io
import os
import re
import traceback
from collections.abc import Callable
from typing import Any
@@ -20,7 +21,7 @@ logger = get_logger("tts_voice_plugin.service")
class TTSService:
"""封装了TTS合成的核心逻辑"""
"""GPT-SoVITS TTS 服务"""
def __init__(self, get_config_func: Callable[[str, Any], Any]):
self.get_config = get_config_func
@@ -37,133 +38,133 @@ class TTSService:
self.tts_styles = self._load_tts_styles()
if self.tts_styles:
logger.info(f"TTS服务已成功加载风格: {list(self.tts_styles.keys())}")
logger.info(f"GPT-SoVITS服务已成功加载风格: {list(self.tts_styles.keys())}")
else:
logger.warning("TTS风格配置为空请检查配置文件")
except Exception as e:
logger.error(f"TTS服务配置加载失败: {e}")
logger.error(f"GPT-SoVITS服务配置加载失败: {e}")
logger.error(traceback.format_exc())
def _load_tts_styles(self) -> dict[str, dict[str, Any]]:
"""加载 TTS 风格配置"""
styles = {}
global_server = self.get_config("tts.server", "http://127.0.0.1:9880")
tts_styles_config = self.get_config("tts_styles", [])
try:
styles = {}
global_server = self.get_config("tts.server", "http://127.0.0.1:9880")
tts_styles_config = self.get_config("tts_styles", [])
if not isinstance(tts_styles_config, list):
logger.error(f"tts_styles 配置不是一个列表, 而是 {type(tts_styles_config)}")
if not isinstance(tts_styles_config, list):
logger.error(f"tts_styles 配置不是一个列表, 而是 {type(tts_styles_config)}")
return styles
default_cfg = next((s for s in tts_styles_config if s.get("style_name") == "default"), None)
if not default_cfg:
logger.error("在 tts_styles 配置中未找到 'default' 风格,这是必需的。")
return styles
default_refer_wav = default_cfg.get("refer_wav_path", "")
default_prompt_text = default_cfg.get("prompt_text", "")
default_gpt_weights = default_cfg.get("gpt_weights", "")
default_sovits_weights = default_cfg.get("sovits_weights", "")
if not default_refer_wav:
logger.warning("TTS 'default' style is missing 'refer_wav_path'.")
for style_cfg in tts_styles_config:
if not isinstance(style_cfg, dict):
continue
style_name = style_cfg.get("style_name")
if not style_name:
continue
styles[style_name] = {
"url": global_server,
"name": style_cfg.get("name", style_name),
"refer_wav_path": style_cfg.get("refer_wav_path", default_refer_wav),
"prompt_text": style_cfg.get("prompt_text", default_prompt_text),
"prompt_language": style_cfg.get("prompt_language", "zh"),
"gpt_weights": style_cfg.get("gpt_weights", default_gpt_weights),
"sovits_weights": style_cfg.get("sovits_weights", default_sovits_weights),
"speed_factor": style_cfg.get("speed_factor"),
"text_language": style_cfg.get("text_language", "auto"),
}
return styles
default_cfg = next((s for s in tts_styles_config if s.get("style_name") == "default"), None)
if not default_cfg:
logger.error("在 tts_styles 配置中未找到 'default' 风格,这是必需的。")
return styles
default_refer_wav = default_cfg.get("refer_wav_path", "")
default_prompt_text = default_cfg.get("prompt_text", "")
default_gpt_weights = default_cfg.get("gpt_weights", "")
default_sovits_weights = default_cfg.get("sovits_weights", "")
if not default_refer_wav:
logger.warning("TTS 'default' style is missing 'refer_wav_path'.")
for style_cfg in tts_styles_config:
if not isinstance(style_cfg, dict):
continue
style_name = style_cfg.get("style_name")
if not style_name:
continue
styles[style_name] = {
"url": global_server,
"name": style_cfg.get("name", style_name),
"refer_wav_path": style_cfg.get("refer_wav_path", default_refer_wav),
"prompt_text": style_cfg.get("prompt_text", default_prompt_text),
"prompt_language": style_cfg.get("prompt_language", "zh"),
"gpt_weights": style_cfg.get("gpt_weights", default_gpt_weights),
"sovits_weights": style_cfg.get("sovits_weights", default_sovits_weights),
"speed_factor": style_cfg.get("speed_factor"),
"text_language": style_cfg.get("text_language", "auto"), # 新增:读取文本语言模式
}
return styles
except Exception as e:
logger.error(f"加载TTS风格配置失败: {e}")
logger.error(traceback.format_exc())
return {}
def _determine_final_language(self, text: str, mode: str) -> str:
"""根据配置的语言策略和文本内容决定最终发送给API的语言代码"""
# 如果策略是具体的语言(如 all_zh, ja直接使用
if mode not in ["auto", "auto_yue"]:
return mode
try:
if mode not in ["auto", "auto_yue"]:
return mode
# 对于 auto 和 auto_yue 策略,进行内容检测
# 优先检测粤语
if mode == "auto_yue":
cantonese_keywords = ["", "", "", "", "", "", "", "", ""]
if any(keyword in text for keyword in cantonese_keywords):
logger.info("在 auto_yue 模式下检测到粤语关键词,最终语言: yue")
return "yue"
if mode == "auto_yue":
cantonese_keywords = ["", "", "", "", "", "", "", "", ""]
if any(keyword in text for keyword in cantonese_keywords):
logger.info("在 auto_yue 模式下检测到粤语关键词,最终语言: yue")
return "yue"
# 检测日语(简单启发式规则)
japanese_chars = len(re.findall(r"[\u3040-\u309f\u30a0-\u30ff]", text))
if japanese_chars > 5 and japanese_chars > len(re.findall(r"[\u4e00-\u9fff]", text)) * 0.5:
logger.info("检测到日语字符,最终语言: ja")
return "ja"
japanese_chars = len(re.findall(r"[\u3040-\u309f\u30a0-\u30ff]", text))
if japanese_chars > 5 and japanese_chars > len(re.findall(r"[\u4e00-\u9fff]", text)) * 0.5:
logger.info("检测到日语字符,最终语言: ja")
return "ja"
# 默认回退到中文
logger.info(f"{mode} 模式下未检测到特定语言,默认回退到: zh")
return "zh"
logger.info(f"{mode} 模式下未检测到特定语言,默认回退到: zh")
return "zh"
except Exception as e:
logger.error(f"语言检测失败: {e}")
return "zh"
def _clean_text_for_tts(self, text: str) -> str:
# 1. 基本清理
text = re.sub(r"[\(\[【].*?[\)\]】]", "", text)
text = re.sub(r"([,。!?、;:,.!?;:~\-`])\1+", r"\1", text)
text = re.sub(r"~{2,}|{2,}", "", text)
text = re.sub(r"\.{3,}|…{1,}", "", text)
# 2. 词语替换
replacements = {"www": "哈哈哈", "hhh": "哈哈", "233": "哈哈", "666": "厉害", "88": "拜拜"}
for old, new in replacements.items():
text = text.replace(old, new)
# 3. 移除不必要的字符 (恢复使用更安全的原版正则,避免误删)
text = re.sub(r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,.!?;:~]", "", text)
# 4. 确保结尾有标点
if text and not text.endswith(tuple(",。!?、;:,.!?;:")):
text += ""
# 5. 智能截断 (保留改进的截断逻辑)
if len(text) > self.max_text_length:
cut_text = text[:self.max_text_length]
punctuation = "。!?.…"
last_punc_pos = max(cut_text.rfind(p) for p in punctuation)
if last_punc_pos != -1:
text = cut_text[:last_punc_pos + 1]
else:
last_comma_pos = max(cut_text.rfind(p) for p in ",、;,;")
if last_comma_pos != -1:
text = cut_text[:last_comma_pos + 1]
else:
text = cut_text
return text.strip()
async def _call_tts_api(self, server_config: dict, text: str, text_language: str, **kwargs) -> bytes | None:
"""
最终修复版:先切换模型,然后仅通过路径发送合成请求。
"""
ref_wav_path = kwargs.get("refer_wav_path")
if not ref_wav_path:
logger.error(f"API 调用失败:缺少 refer_wav_path。当前风格配置: {server_config}")
return None
"""清理文本使其适合TTS"""
try:
text = re.sub(r"[\(\[【].*?[\)\]】]", "", text)
text = re.sub(r"([,。!?、;:,.!?;:~\-`])\1+", r"\1", text)
text = re.sub(r"~{2,}|{2,}", "", text)
text = re.sub(r"\.{3,}|…{1,}", "", text)
replacements = {"www": "哈哈哈", "hhh": "哈哈", "233": "哈哈", "666": "厉害", "88": "拜拜"}
for old, new in replacements.items():
text = text.replace(old, new)
text = re.sub(r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,.!?;:~]", "", text)
if text and not text.endswith(tuple(",。!?、;:,.!?;:")):
text += ""
if len(text) > self.max_text_length:
cut_text = text[:self.max_text_length]
punctuation = "。!?.…"
last_punc_pos = max(cut_text.rfind(p) for p in punctuation)
if last_punc_pos != -1:
text = cut_text[:last_punc_pos + 1]
else:
last_comma_pos = max(cut_text.rfind(p) for p in ",、;,;")
if last_comma_pos != -1:
text = cut_text[:last_comma_pos + 1]
else:
text = cut_text
return text.strip()
except Exception as e:
logger.error(f"文本清理失败: {e}")
return text
async def _call_gpt_sovits_api(self, server_config: dict, text: str, text_language: str, **kwargs) -> bytes | None:
"""Call the GPT-SoVITS HTTP API; returns raw audio bytes, or None on failure."""
# NOTE(review): this block is diff residue — the "@@" hunk markers below
# elide original source lines, and several old/new line pairs from the
# rename (_call_tts_api -> _call_gpt_sovits_api) appear back to back.
try:
ref_wav_path = kwargs.get("refer_wav_path")
if not ref_wav_path:
logger.error(f"API 调用失败:缺少 refer_wav_path。当前风格配置: {server_config}")
return None
base_url = server_config["url"].rstrip("/")
# --- Step 1: switch model weights first, as the stable version does ---
async def switch_model_weights(weights_path: str | None, weight_type: str):
# weight_type is "gpt" or "sovits"; no-op when no path is configured.
if not weights_path:
return
api_endpoint = f"/set_{weight_type}_weights"
switch_url = f"{base_url}{api_endpoint}"
@@ -181,30 +182,23 @@ class TTSService:
# NOTE(review): hunk marker above — the HTTP request that performs the
# weight switch is elided from this view.
await switch_model_weights(kwargs.get("gpt_weights"), "gpt")
await switch_model_weights(kwargs.get("sovits_weights"), "sovits")
# --- Step 2: build a clean request payload (no Base64 audio inlined) ---
data = {
"text": text,
"text_lang": text_language,
"ref_audio_path": ref_wav_path,
"prompt_text": kwargs.get("prompt_text", ""),
"prompt_lang": kwargs.get("prompt_language", "zh"),
# In the stable version these two are switched via the API above,
# not placed directly in the request body.
# "gpt_model_path": kwargs.get("gpt_weights"),
# "sovits_model_path": kwargs.get("sovits_weights"),
}
# Merge advanced synthesis options (top_k, temperature, ...) from config.
advanced_config = self.get_config("tts_advanced", {})
if isinstance(advanced_config, dict):
data.update({k: v for k, v in advanced_config.items() if v is not None})
# Style-specific speed factor takes precedence over the advanced config.
if server_config.get("speed_factor") is not None:
data["speed_factor"] = server_config["speed_factor"]
# --- Step 3: send the final synthesis request ---
tts_url = base_url if base_url.endswith("/tts") else f"{base_url}/tts"
# NOTE(review): old/new duplicate log lines from the diff.
logger.info(f"发送到 TTS API 的数据: {data}")
logger.info(f"发送到 GPT-SoVITS API 的数据: {data}")
async with aiohttp.ClientSession() as session:
async with session.post(tts_url, json=data, timeout=aiohttp.ClientTimeout(total=self.timeout)) as response:
@@ -212,13 +206,14 @@ class TTSService:
# NOTE(review): hunk marker above — the success-status check
# (presumably `if response.status == 200:`) is elided; TODO confirm.
return await response.read()
else:
error_info = await response.text()
# NOTE(review): old/new duplicate log lines from the diff.
logger.error(f"TTS API调用失败: {response.status} - {error_info}")
logger.error(f"GPT-SoVITS API调用失败: {response.status} - {error_info}")
return None
except asyncio.TimeoutError:
logger.error("TTS服务请求超时")
logger.error("GPT-SoVITS服务请求超时")
return None
except Exception as e:
logger.error(f"TTS API调用异常: {e}")
logger.error(f"GPT-SoVITS API调用异常: {e}")
logger.error(traceback.format_exc())
return None
async def _apply_spatial_audio_effect(self, audio_data: bytes) -> bytes | None:
@@ -226,19 +221,14 @@ class TTSService:
# NOTE(review): this block is diff residue — "@@" hunk markers elide the
# docstring and several effect parameters from this view.
# Purpose: optionally post-process synthesized audio with Pedalboard
# reverb/convolution effects; returns the original bytes when disabled,
# empty, or on error (never raises).
try:
effects_config = self.get_config("spatial_effects", {})
if not effects_config.get("enabled", False):
return audio_data
# Resolve the plugin directory and the impulse-response (IR) file path.
# Built from __file__ so it is robust and independent of the CWD.
plugin_file = os.path.abspath(__file__)
# services -> tts_voice_plugin -> plugins -> Bot
bot_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(plugin_file))))
ir_path = os.path.join(bot_root, "assets", "small_room_ir.wav")
effects = []
# Add a Reverb effect when enabled in config.
if effects_config.get("reverb_enabled", False):
effects.append(Reverb(
room_size=effects_config.get("room_size", 0.15),
@@ -248,7 +238,6 @@ class TTSService:
# NOTE(review): hunk marker above — damping/wet/dry parameters elided.
width=effects_config.get("width", 1.0)
))
# Add a Convolution effect when enabled and the IR file exists.
if effects_config.get("convolution_enabled", False) and os.path.exists(ir_path):
effects.append(Convolution(
impulse_response_filename=ir_path,
@@ -258,19 +247,14 @@ class TTSService:
# NOTE(review): hunk marker above — mix parameter and the matching
# `else`/condition for the warning below are elided.
logger.warning(f"卷积混响已启用但IR文件不存在 ({ir_path}),跳过该效果。")
if not effects:
return audio_data
# Load the raw audio bytes into an in-memory AudioFile object.
with io.BytesIO(audio_data) as audio_stream:
with AudioFile(audio_stream, "r") as f:
board = Pedalboard(effects)
effected = board(f.read(f.frames), f.samplerate)
# Write the processed audio back into an in-memory byte stream.
with io.BytesIO() as output_stream:
# Use soundfile for writing, as it is more stable.
sf.write(output_stream, effected.T, f.samplerate, format="WAV")
processed_audio_data = output_stream.getvalue()
@@ -279,66 +263,69 @@ class TTSService:
# NOTE(review): hunk marker above — the success `return` of
# processed_audio_data is elided from this view.
except Exception as e:
logger.error(f"应用空间效果时出错: {e}")
return audio_data  # On error, fall back to the original audio.
logger.error(traceback.format_exc())
return audio_data
async def generate_voice(self, text: str, style_hint: str = "default", language_hint: str | None = None) -> str | None:
    """Generate speech audio via GPT-SoVITS and return it base64-encoded.

    Reconstructed from diff residue: the source interleaved the old and
    new implementations (duplicate style fallback, duplicate spatial
    branch, both _call_tts_api and _call_gpt_sovits_api call sites);
    this is the coherent post-commit version.

    Args:
        text: Raw text to synthesize; cleaned via _clean_text_for_tts.
        style_hint: Preferred voice style key; falls back to "default"
            or the first configured style when missing.
        language_hint: Language forced by the decision model; when given
            it overrides the style's own language policy.

    Returns:
        Base64-encoded audio bytes, or None on any failure.
    """
    try:
        self._load_config()
        if not self.tts_styles:
            logger.error("TTS风格配置为空无法生成语音。")
            return None
        # Resolve the requested style with graceful fallback:
        # hint -> "default" -> first configured style.
        style = style_hint if style_hint in self.tts_styles else "default"
        if style not in self.tts_styles:
            if "default" in self.tts_styles:
                style = "default"
                logger.warning(f"指定风格 '{style_hint}' 不存在,自动回退到: 'default'")
            elif self.tts_styles:
                style = next(iter(self.tts_styles))
                logger.warning(f"指定风格 '{style_hint}''default' 均不存在,自动回退到第一个可用风格: {style}")
            else:
                logger.error("没有任何可用的TTS风格配置")
                return None
        server_config = self.tts_styles[style]
        clean_text = self._clean_text_for_tts(text)
        if not clean_text:
            return None
        # Language decision: an explicit hint from the decision model wins;
        # otherwise apply the style's configured language policy.
        if language_hint:
            final_language = language_hint
            logger.info(f"使用决策模型指定的语言: {final_language}")
        else:
            language_policy = server_config.get("text_language", "auto")
            final_language = self._determine_final_language(clean_text, language_policy)
            logger.info(f"决策模型未指定语言,使用策略 '{language_policy}' -> 最终语言: {final_language}")
        logger.info(f"开始GPT-SoVITS语音合成文本{clean_text[:50]}..., 风格:{style}, 最终语言: {final_language}")
        audio_data = await self._call_gpt_sovits_api(
            server_config=server_config, text=clean_text, text_language=final_language,
            refer_wav_path=server_config.get("refer_wav_path"),
            prompt_text=server_config.get("prompt_text"),
            prompt_language=server_config.get("prompt_language"),
            gpt_weights=server_config.get("gpt_weights"),
            sovits_weights=server_config.get("sovits_weights"),
        )
        if audio_data:
            # Optional post-processing: spatial audio effects; failure
            # falls back to the unprocessed audio.
            spatial_config = self.get_config("spatial_effects", {})
            if spatial_config.get("enabled", False):
                logger.info("检测到已启用空间音频效果,开始处理...")
                processed_audio = await self._apply_spatial_audio_effect(audio_data)
                if processed_audio:
                    logger.info("空间音频效果应用成功!")
                    audio_data = processed_audio
                else:
                    logger.warning("空间音频效果应用失败,将使用原始音频。")
            return base64.b64encode(audio_data).decode("utf-8")
        return None
    except Exception as e:
        logger.error(f"语音合成失败: {e}")
        logger.error(traceback.format_exc())
        return None