feat(voice): 实现自身语音缓存以跳过ASR识别
当机器人通过 TTS 插件发送语音时,会自动将语音数据与对应的原始文本一并缓存。当机器人接收到自己发送的语音消息时,会优先从缓存中直接读取文本,从而跳过调用 ASR(自动语音识别)服务的步骤;缓存未命中时仍会回退到标准语音识别流程。

此项优化可以:
- 节省不必要的 ASR 资源开销和费用。
- 在缓存命中时保证对机器人自身语音 100% 的识别准确性(直接使用原始文本)。
- 提升对自身消息的响应处理速度。
This commit is contained in:
@@ -12,6 +12,7 @@ from src.chat.message_receive.chat_stream import ChatStream
|
||||
from src.chat.utils.utils_image import get_image_manager
|
||||
from src.chat.utils.utils_video import get_video_analyzer, is_video_analysis_available
|
||||
from src.chat.utils.utils_voice import get_voice_text
|
||||
from src.chat.utils.self_voice_cache import consume_self_voice_text
|
||||
from src.common.logger import get_logger
|
||||
from src.config.config import global_config
|
||||
|
||||
@@ -30,7 +31,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
@dataclass
|
||||
class Message(MessageBase, metaclass=ABCMeta):
|
||||
chat_stream: "ChatStream" = None # type: ignore
|
||||
chat_stream: Optional["ChatStream"] = None
|
||||
reply: Optional["Message"] = None
|
||||
processed_plain_text: str = ""
|
||||
memorized_times: int = 0
|
||||
@@ -170,10 +171,10 @@ class MessageRecv(Message):
|
||||
self.is_emoji = False
|
||||
self.is_video = False
|
||||
# 处理at消息,格式为"昵称:QQ号"
|
||||
if segment.data and ":" in segment.data:
|
||||
if isinstance(segment.data, str) and ":" in segment.data:
|
||||
nickname, qq_id = segment.data.split(":", 1)
|
||||
return f"@{nickname}"
|
||||
return f"@{segment.data}" if segment.data else "@未知用户"
|
||||
return f"@{segment.data}" if isinstance(segment.data, str) else "@未知用户"
|
||||
elif segment.type == "image":
|
||||
# 如果是base64图片数据
|
||||
if isinstance(segment.data, str):
|
||||
@@ -200,6 +201,19 @@ class MessageRecv(Message):
|
||||
self.is_emoji = False
|
||||
self.is_voice = True
|
||||
self.is_video = False
|
||||
|
||||
# 检查消息是否由机器人自己发送
|
||||
if self.message_info and self.message_info.user_info and str(self.message_info.user_info.user_id) == str(global_config.bot.qq_account):
|
||||
logger.info(f"检测到机器人自身发送的语音消息 (User ID: {self.message_info.user_info.user_id}),尝试从缓存获取文本。")
|
||||
if isinstance(segment.data, str):
|
||||
cached_text = consume_self_voice_text(segment.data)
|
||||
if cached_text:
|
||||
logger.info(f"成功从缓存中获取语音文本: '{cached_text[:70]}...'")
|
||||
return f"[语音:{cached_text}]"
|
||||
else:
|
||||
logger.warning("机器人自身语音消息缓存未命中,将回退到标准语音识别。")
|
||||
|
||||
# 标准语音识别流程 (也作为缓存未命中的后备方案)
|
||||
if isinstance(segment.data, str):
|
||||
return await get_voice_text(segment.data)
|
||||
return "[发了一段语音,网卡了加载不出来]"
|
||||
@@ -298,7 +312,7 @@ class MessageRecvS4U(MessageRecv):
|
||||
self.is_superchat = False
|
||||
self.gift_info = None
|
||||
self.gift_name = None
|
||||
self.gift_count: str | None = None
|
||||
self.gift_count: int | None = None
|
||||
self.superchat_info = None
|
||||
self.superchat_price = None
|
||||
self.superchat_message_text = None
|
||||
@@ -350,6 +364,20 @@ class MessageRecvS4U(MessageRecv):
|
||||
self.is_picid = False
|
||||
self.is_emoji = False
|
||||
self.is_voice = True
|
||||
|
||||
# 检查消息是否由机器人自己发送
|
||||
# 检查消息是否由机器人自己发送
|
||||
if self.message_info and self.message_info.user_info and str(self.message_info.user_info.user_id) == str(global_config.bot.qq_account):
|
||||
logger.info(f"检测到机器人自身发送的语音消息 (User ID: {self.message_info.user_info.user_id}),尝试从缓存获取文本。")
|
||||
if isinstance(segment.data, str):
|
||||
cached_text = consume_self_voice_text(segment.data)
|
||||
if cached_text:
|
||||
logger.info(f"成功从缓存中获取语音文本: '{cached_text[:70]}...'")
|
||||
return f"[语音:{cached_text}]"
|
||||
else:
|
||||
logger.warning("机器人自身语音消息缓存未命中,将回退到标准语音识别。")
|
||||
|
||||
# 标准语音识别流程 (也作为缓存未命中的后备方案)
|
||||
if isinstance(segment.data, str):
|
||||
return await get_voice_text(segment.data)
|
||||
return "[发了一段语音,网卡了加载不出来]"
|
||||
@@ -435,8 +463,8 @@ class MessageRecvS4U(MessageRecv):
|
||||
|
||||
# 使用video analyzer分析视频
|
||||
video_analyzer = get_video_analyzer()
|
||||
result = await video_analyzer.analyze_video(
|
||||
video_bytes, filename, prompt=global_config.video_analysis.batch_analysis_prompt
|
||||
result = await video_analyzer.analyze_video_from_bytes(
|
||||
video_bytes, filename
|
||||
)
|
||||
|
||||
logger.info(f"视频分析结果: {result}")
|
||||
@@ -524,15 +552,28 @@ class MessageProcessBase(Message):
|
||||
return await get_image_manager().get_emoji_tag(seg.data)
|
||||
return "[表情,网卡了加载不出来]"
|
||||
elif seg.type == "voice":
|
||||
# 检查消息是否由机器人自己发送
|
||||
# 检查消息是否由机器人自己发送
|
||||
if self.message_info and self.message_info.user_info and str(self.message_info.user_info.user_id) == str(global_config.bot.qq_account):
|
||||
logger.info(f"检测到机器人自身发送的语音消息 (User ID: {self.message_info.user_info.user_id}),尝试从缓存获取文本。")
|
||||
if isinstance(seg.data, str):
|
||||
cached_text = consume_self_voice_text(seg.data)
|
||||
if cached_text:
|
||||
logger.info(f"成功从缓存中获取语音文本: '{cached_text[:70]}...'")
|
||||
return f"[语音:{cached_text}]"
|
||||
else:
|
||||
logger.warning("机器人自身语音消息缓存未命中,将回退到标准语音识别。")
|
||||
|
||||
# 标准语音识别流程 (也作为缓存未命中的后备方案)
|
||||
if isinstance(seg.data, str):
|
||||
return await get_voice_text(seg.data)
|
||||
return "[发了一段语音,网卡了加载不出来]"
|
||||
elif seg.type == "at":
|
||||
# 处理at消息,格式为"昵称:QQ号"
|
||||
if seg.data and ":" in seg.data:
|
||||
if isinstance(seg.data, str) and ":" in seg.data:
|
||||
nickname, qq_id = seg.data.split(":", 1)
|
||||
return f"@{nickname}"
|
||||
return f"@{seg.data}" if seg.data else "@未知用户"
|
||||
return f"@{seg.data}" if isinstance(seg.data, str) else "@未知用户"
|
||||
elif seg.type == "reply":
|
||||
if self.reply and hasattr(self.reply, "processed_plain_text"):
|
||||
# print(f"self.reply.processed_plain_text: {self.reply.processed_plain_text}")
|
||||
@@ -617,7 +658,8 @@ class MessageSending(MessageProcessBase):
|
||||
|
||||
def to_dict(self):
    """Serialize the message, sourcing ``user_info`` from the chat stream.

    Returns:
        The dict built by the parent class's ``to_dict()``. Its
        ``["message_info"]["user_info"]`` entry is overwritten with the
        chat stream's user info when both the stream and its user info
        are present; otherwise the parent's value is left untouched.
    """
    ret = super().to_dict()
    # chat_stream is declared Optional, so it must be checked before
    # dereferencing; an unconditional access would raise AttributeError
    # for messages that have no stream attached.
    stream = self.chat_stream
    if stream and stream.user_info:
        ret["message_info"]["user_info"] = stream.user_info.to_dict()
    return ret
|
||||
|
||||
def is_private_message(self) -> bool:
|
||||
|
||||
42
src/chat/utils/self_voice_cache.py
Normal file
42
src/chat/utils/self_voice_cache.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
自我语音缓存模块
|
||||
|
||||
用于在机器人发送TTS语音后,临时存储其原始文本,
|
||||
以便在接收到该语音消息时,能够直接获取文本内容,
|
||||
避免不必要的自我语音识别。
|
||||
"""
|
||||
import hashlib
|
||||
from typing import Dict
|
||||
|
||||
# 一个简单的内存缓存,用于将机器人自己发送的语音消息映射到其原始文本。
|
||||
# 键是语音base64内容的SHA256哈希值。
|
||||
_self_voice_cache: Dict[str, str] = {}
|
||||
|
||||
def get_voice_key(base64_content: str) -> str:
|
||||
"""为语音内容生成一个一致的键。"""
|
||||
return hashlib.sha256(base64_content.encode('utf-8')).hexdigest()
|
||||
|
||||
def register_self_voice(base64_content: str, text: str):
|
||||
"""
|
||||
为机器人自己发送的语音消息注册其原始文本。
|
||||
|
||||
Args:
|
||||
base64_content (str): 语音的base64编码内容。
|
||||
text (str): 原始文本。
|
||||
"""
|
||||
key = get_voice_key(base64_content)
|
||||
_self_voice_cache[key] = text
|
||||
|
||||
def consume_self_voice_text(base64_content: str) -> str | None:
|
||||
"""
|
||||
获取并移除机器人自己发送的语音消息的原始文本。
|
||||
这是一个一次性操作,获取后即从缓存中删除。
|
||||
|
||||
Args:
|
||||
base64_content (str): 语音的base64编码内容。
|
||||
|
||||
Returns:
|
||||
str | None: 如果找到,则返回原始文本,否则返回None。
|
||||
"""
|
||||
key = get_voice_key(base64_content)
|
||||
return _self_voice_cache.pop(key, None)
|
||||
@@ -8,6 +8,7 @@ import toml
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.plugin_system.base.base_action import BaseAction, ChatMode
|
||||
from src.chat.utils.self_voice_cache import register_self_voice
|
||||
|
||||
from ..services.manager import get_service
|
||||
|
||||
@@ -174,6 +175,8 @@ class TTSVoiceAction(BaseAction):
|
||||
)
|
||||
|
||||
if audio_b64:
|
||||
# 在发送语音前,将文本注册到缓存中
|
||||
register_self_voice(audio_b64, text)
|
||||
await self.send_custom(message_type="voice", content=audio_b64)
|
||||
logger.info(f"{self.log_prefix} GPT-SoVITS语音发送成功")
|
||||
await self.store_action_info(
|
||||
|
||||
Reference in New Issue
Block a user