初始化
This commit is contained in:
13
src/chat/__init__.py
Normal file
13
src/chat/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
"""
|
||||
MaiBot模块系统
|
||||
包含聊天、情绪、记忆、日程等功能模块
|
||||
"""
|
||||
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
from src.chat.emoji_system.emoji_manager import get_emoji_manager
|
||||
|
||||
# Export the main components for external use
__all__ = [
    "get_chat_manager",
    "get_emoji_manager",
]
|
||||
929
src/chat/chat_loop/heartFC_chat.py
Normal file
929
src/chat/chat_loop/heartFC_chat.py
Normal file
@@ -0,0 +1,929 @@
|
||||
import asyncio
|
||||
import time
|
||||
import traceback
|
||||
import random
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
from rich.traceback import install
|
||||
|
||||
from src.config.config import global_config
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.message_receive.chat_stream import ChatStream, get_chat_manager
|
||||
from src.chat.utils.prompt_builder import global_prompt_manager
|
||||
from src.chat.utils.timer_calculator import Timer
|
||||
from src.chat.planner_actions.planner import ActionPlanner
|
||||
from src.chat.planner_actions.action_modifier import ActionModifier
|
||||
from src.chat.planner_actions.action_manager import ActionManager
|
||||
from src.chat.chat_loop.hfc_utils import CycleDetail
|
||||
from src.person_info.relationship_builder_manager import relationship_builder_manager
|
||||
from src.chat.express.expression_learner import expression_learner_manager
|
||||
from src.person_info.person_info import get_person_info_manager
|
||||
from src.plugin_system.base.component_types import ActionInfo, ChatMode, EventType
|
||||
from src.plugin_system.core import events_manager
|
||||
from src.plugin_system.apis import generator_api, send_api, message_api, database_api
|
||||
from src.chat.willing.willing_manager import get_willing_manager
|
||||
from src.mais4u.mai_think import mai_thinking_manager
|
||||
from src.mais4u.constant_s4u import ENABLE_S4U
|
||||
from src.plugins.built_in.core_actions.no_reply import NoReplyAction
|
||||
from src.chat.chat_loop.hfc_utils import send_typing, stop_typing
|
||||
|
||||
# Loop-info record describing a failed cycle (action_type "error").
# NOTE(review): "taken_time" is evaluated once, when this module is imported,
# not when an error occurs — confirm consumers do not treat it as an event time.
ERROR_LOOP_INFO = {
    "loop_plan_info": {
        "action_result": {
            "action_type": "error",
            "action_data": {},
            "reasoning": "循环处理失败",
        },
    },
    "loop_action_info": {
        "action_taken": False,
        "reply_text": "",
        "command": "",
        "taken_time": time.time(),
    },
}

# Plan result whose action_type is "no_action"; presumably the planner's
# initial default (per its "reasoning" text) — verify against the planner.
NO_ACTION = {
    "action_result": {
        "action_type": "no_action",
        "action_data": {},
        "reasoning": "规划器初始化默认",
        "is_parallel": True,
    },
    "chat_context": "",
    "action_prompt": "",
}

# Install rich's traceback handler for this process.
install(extra_lines=3)

# Note: the former action-modification timeout constant was removed because
# action modification now runs sequentially.

logger = get_logger("hfc")  # Logger Name Changed
|
||||
|
||||
|
||||
class HeartFChatting:
|
||||
"""
|
||||
管理一个连续的Focus Chat循环
|
||||
用于在特定聊天流中生成回复。
|
||||
其生命周期现在由其关联的 SubHeartflow 的 FOCUSED 状态控制。
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    chat_id: str,
):
    """
    Initialise a HeartFChatting instance bound to one chat stream.

    Args:
        chat_id: Unique identifier of the chat stream (e.g. a stream_id).

    Raises:
        ValueError: If the chat manager returns no stream for ``chat_id``.
    """
    # Basic attributes
    self.stream_id: str = chat_id  # chat stream ID
    self.chat_stream: ChatStream = get_chat_manager().get_stream(self.stream_id)  # type: ignore
    if not self.chat_stream:
        raise ValueError(f"无法找到聊天流: {self.stream_id}")
    # Log prefix: the stream's display name, falling back to its ID
    self.log_prefix = f"[{get_chat_manager().get_stream_name(self.stream_id) or self.stream_id}]"

    self.relationship_builder = relationship_builder_manager.get_or_create_builder(self.stream_id)
    self.expression_learner = expression_learner_manager.get_expression_learner(self.stream_id)

    self.loop_mode = ChatMode.NORMAL  # the loop starts in NORMAL mode

    self.last_action = "no_action"

    # The planner and the modifier share the same ActionManager instance
    self.action_manager = ActionManager()
    self.action_planner = ActionPlanner(chat_id=self.stream_id, action_manager=self.action_manager)
    self.action_modifier = ActionModifier(action_manager=self.action_manager, chat_id=self.stream_id)

    # Internal loop-control state
    self.running: bool = False
    self._loop_task: Optional[asyncio.Task] = None  # main loop task
    self._energy_task: Optional[asyncio.Task] = None

    # Cycle bookkeeping
    self.history_loop: List[CycleDetail] = []
    self._cycle_counter = 0
    self._current_cycle_detail: CycleDetail = None  # type: ignore

    self.reply_timeout_count = 0
    self.plan_timeout_count = 0

    # Start one second in the past so the first poll window is non-empty
    self.last_read_time = time.time() - 1

    self.willing_manager = get_willing_manager()

    # NOTE(review): this "initialisation complete" message is logged before the
    # three attributes below are assigned.
    logger.info(f"{self.log_prefix} HeartFChatting 初始化完成")

    self.energy_value = 5

    self.focus_energy = 1
    self.no_reply_consecutive = 0
|
||||
|
||||
async def start(self):
    """Start the energy loop and the main chat loop unless already running.

    Raises:
        Exception: Whatever task creation raised; ``running`` is reset and
            ``_loop_task`` cleared before the error is re-raised.
    """
    # Already active: nothing to do.
    if self.running:
        logger.debug(f"{self.log_prefix} HeartFChatting 已激活,无需重复启动")
        return

    # Flag as active first so a repeated start() call becomes a no-op.
    self.running = True
    try:
        energy_task = asyncio.create_task(self._energy_loop())
        self._energy_task = energy_task
        energy_task.add_done_callback(self._handle_energy_completion)

        main_task = asyncio.create_task(self._main_chat_loop())
        self._loop_task = main_task
        main_task.add_done_callback(self._handle_loop_completion)
        logger.info(f"{self.log_prefix} HeartFChatting 启动完成")
    except Exception as e:
        # Roll the state back when start-up fails.
        self.running = False
        self._loop_task = None
        logger.error(f"{self.log_prefix} HeartFChatting 启动失败: {e}")
        raise
|
||||
|
||||
def _handle_loop_completion(self, task: asyncio.Task):
    """Done-callback of the main chat loop task; logs how the loop ended.

    Args:
        task: The finished main-loop task.
    """
    try:
        if exception := task.exception():
            logger.error(f"{self.log_prefix} HeartFChatting: 脱离了聊天(异常): {exception}")
            # A done-callback runs outside any ``except`` block, so
            # traceback.format_exc() would only yield "NoneType: None".
            # Format the traceback carried by the task's exception instead.
            logger.error(
                "".join(traceback.format_exception(type(exception), exception, exception.__traceback__))
            )
        else:
            logger.info(f"{self.log_prefix} HeartFChatting: 脱离了聊天 (外部停止)")
    except asyncio.CancelledError:
        # task.exception() raises CancelledError when the task was cancelled.
        logger.info(f"{self.log_prefix} HeartFChatting: 结束了聊天")
|
||||
|
||||
def start_cycle(self):
    """Open a new thinking cycle.

    Increments the cycle counter, stores a fresh ``CycleDetail`` as the
    current cycle and gives it a time-based thinking id.

    Returns:
        A tuple ``(cycle_timers, thinking_id)`` where ``cycle_timers`` is a
        new empty dict.
    """
    self._cycle_counter += 1
    detail = CycleDetail(self._cycle_counter)
    self._current_cycle_detail = detail
    detail.thinking_id = f"tid{round(time.time(), 2)!s}"
    return {}, detail.thinking_id
|
||||
|
||||
def end_cycle(self, loop_info, cycle_timers):
    """Close the current cycle and append it to ``history_loop``.

    Args:
        loop_info: Loop-info mapping passed to the cycle's ``set_loop_info``.
        cycle_timers: Timer mapping stored on the cycle as ``timers``.
    """
    detail = self._current_cycle_detail
    detail.set_loop_info(loop_info)
    self.history_loop.append(detail)
    detail.timers = cycle_timers
    detail.end_time = time.time()
|
||||
|
||||
def _handle_energy_completion(self, task: asyncio.Task):
    """Done-callback of the energy loop task; logs how the loop ended.

    Args:
        task: The finished energy-loop task.
    """
    try:
        exception = task.exception()
    except asyncio.CancelledError:
        # task.exception() raises CancelledError for a cancelled task; without
        # this guard the error would escape from the callback.
        logger.info(f"{self.log_prefix} HeartFChatting: 能量循环已取消")
        return

    if exception:
        logger.error(f"{self.log_prefix} HeartFChatting: 能量循环异常: {exception}")
        # No exception is being handled inside a done-callback, so
        # traceback.format_exc() would be empty; format the task's own one.
        logger.error("".join(traceback.format_exception(type(exception), exception, exception.__traceback__)))
    else:
        logger.info(f"{self.log_prefix} HeartFChatting: 能量循环完成")
|
||||
|
||||
async def _energy_loop(self):
    """Decay ``energy_value`` every 10 seconds while the instance is running.

    NORMAL mode loses 0.3 per tick, FOCUS mode 0.6; the value never drops
    below 0.3.
    """
    while self.running:
        await asyncio.sleep(10)
        if self.loop_mode == ChatMode.NORMAL:
            decay = 0.3
        elif self.loop_mode == ChatMode.FOCUS:
            decay = 0.6
        else:
            # Any other mode leaves the energy untouched.
            continue
        self.energy_value = max(self.energy_value - decay, 0.3)
|
||||
|
||||
def print_cycle_info(self, cycle_timers):
    """Log a summary of the current cycle: id, duration, chosen action, timers.

    Args:
        cycle_timers: Mapping of timer name to elapsed seconds; values below
            one second are printed in milliseconds.
    """
    detail = self._current_cycle_detail

    def _fmt(seconds):
        return f"{seconds * 1000:.2f}毫秒" if seconds < 1 else f"{seconds:.2f}秒"

    timer_strings = [f"{label}: {_fmt(spent)}" for label, spent in cycle_timers.items()]
    prefix = self.log_prefix
    cycle_id = detail.cycle_id
    duration = detail.end_time - detail.start_time  # type: ignore
    chosen_action = detail.loop_plan_info.get('action_result', {}).get('action_type', '未知动作')
    # The timer breakdown is only appended when at least one timer exists.
    details_suffix = f"\n详情: {'; '.join(timer_strings)}" if timer_strings else ""

    logger.info(
        f"{prefix} 第{cycle_id}次思考,"
        f"耗时: {duration:.1f}秒, "
        f"选择动作: {chosen_action}"
        + details_suffix
    )
|
||||
|
||||
def _determine_form_type(self) -> None:
    """Choose ``focus_energy`` for the next no_reply wait.

    With at most 3 consecutive no_reply actions ``focus_energy`` is 1.
    Otherwise the sum of ``NoReplyAction._recent_interest_records`` is compared
    with ``3 / talk_frequency``: below it, ``focus_energy`` becomes a random
    integer in [3, 6] ("breaking" form); otherwise it is reset to 1.

    Returns:
        None. The result is stored on ``self.focus_energy``. The method never
        returned a value, so the former ``-> str`` annotation was wrong.
    """
    # Up to (and including) 3 consecutive no_reply: keep the waiting form.
    if self.no_reply_consecutive <= 3:
        self.focus_energy = 1
        return

    # Sum of the recently recorded interest values.
    total_recent_interest = sum(NoReplyAction._recent_interest_records)

    # Current talk frequency for this stream.
    talk_frequency = global_config.chat.get_current_talk_frequency(self.stream_id)

    # A non-positive frequency would raise ZeroDivisionError (or flip the sign
    # of the threshold); treat it as an unreachable threshold instead.
    adjusted_threshold = 3 / talk_frequency if talk_frequency > 0 else float("inf")

    logger.info(f"{self.log_prefix} 最近三次兴趣度总和: {total_recent_interest:.2f}, 调整后阈值: {adjusted_threshold:.2f}")

    if total_recent_interest < adjusted_threshold:
        # Not enough interest: switch to the breaking form.
        logger.info(f"{self.log_prefix} 兴趣度不足,进入breaking形式")
        self.focus_energy = random.randint(3, 6)
    else:
        logger.info(f"{self.log_prefix} 兴趣度充足")
        self.focus_energy = 1
|
||||
|
||||
async def _execute_no_reply(self, new_message: List[Dict[str, Any]]) -> bool:
    """Decide whether a no_reply wait (breaking form) should end.

    The wait ends when either the number of new messages reaches
    ``focus_energy / talk_frequency`` or their accumulated interest reaches
    ``3 / talk_frequency``. In both cases the accumulated interest is appended
    to ``NoReplyAction._recent_interest_records``.

    Args:
        new_message: New message dicts; ``processed_plain_text`` and
            ``interest_value`` are read from each.

    Returns:
        True when the wait should end, False otherwise. The original fell off
        the end and returned None here; False is equally falsy for callers.
    """
    new_message_count = len(new_message)
    talk_frequency = global_config.chat.get_current_talk_frequency(self.stream_id)

    # A non-positive frequency would raise ZeroDivisionError; treat both
    # thresholds as unreachable in that case so the wait simply continues.
    if talk_frequency > 0:
        modified_exit_count_threshold = self.focus_energy / talk_frequency
        interest_threshold = 3 / talk_frequency
    else:
        modified_exit_count_threshold = interest_threshold = float("inf")

    # Interest of messages that carry text; a missing or None value counts as 0.
    accumulated_interest = sum(
        (
            (msg_dict.get("interest_value") or 0.0)
            for msg_dict in new_message
            if msg_dict.get("processed_plain_text", "")
        ),
        0.0,
    )

    # 1) Enough messages have accumulated.
    if new_message_count >= modified_exit_count_threshold:
        NoReplyAction._recent_interest_records.append(accumulated_interest)
        logger.info(
            f"{self.log_prefix} 累计消息数量达到{new_message_count}条(>{modified_exit_count_threshold}),结束等待"
        )
        return True

    # 2) Enough interest has accumulated.
    if new_message_count > 0:
        # Only log when the accumulated interest changed since the last call.
        if accumulated_interest != getattr(self, "_last_accumulated_interest", None):
            logger.info(f"{self.log_prefix} breaking形式当前累计兴趣值: {accumulated_interest:.2f}, 当前聊天频率: {talk_frequency:.2f}")
            self._last_accumulated_interest = accumulated_interest

        if accumulated_interest >= interest_threshold:
            NoReplyAction._recent_interest_records.append(accumulated_interest)
            # Report the threshold that was actually applied (3 / talk_frequency);
            # the message used to print 5 / talk_frequency.
            logger.info(
                f"{self.log_prefix} 累计兴趣值达到{accumulated_interest:.2f}(>{interest_threshold}),结束等待"
            )
            return True

    # Report the waiting state whenever the elapsed whole seconds hit a multiple of 10.
    waited = time.time() - self.last_read_time
    if int(waited) > 0 and int(waited) % 10 == 0:
        logger.info(
            f"{self.log_prefix} 已等待{waited:.0f}秒,累计{new_message_count}条消息,继续等待..."
        )

    return False
|
||||
|
||||
|
||||
async def _loopbody(self):
    """Run one iteration of the chat loop.

    Fetches up to 10 of the latest messages since ``last_read_time`` and then,
    depending on ``loop_mode``:

    * FOCUS: if the last action was "no_reply", keep waiting until
      ``_execute_no_reply`` says the wait is over; otherwise run ``_observe``
      and fall back to NORMAL once ``energy_value`` is at or below 1.
    * NORMAL: switch to FOCUS when enough messages arrived or the energy is
      at least 30; otherwise hand the first fetched message to
      ``normal_response`` and adjust the energy.

    Returns:
        Always True in the code shown here.
    """
    recent_messages_dict = message_api.get_messages_by_time_in_chat(
        chat_id=self.stream_id,
        start_time=self.last_read_time,
        end_time=time.time(),
        limit = 10,
        limit_mode="latest",
        filter_mai=True,
        filter_command=True,
    )
    new_message_count = len(recent_messages_dict)

    if self.loop_mode == ChatMode.FOCUS:

        if self.last_action == "no_reply":
            # Still waiting after a no_reply: drain energy and poll again.
            if not await self._execute_no_reply(recent_messages_dict):
                self.energy_value -= 0.3 / global_config.chat.focus_value
                logger.info(f"{self.log_prefix} 能量值减少,当前能量值:{self.energy_value:.1f}")
                await asyncio.sleep(0.5)
                return True

        self.last_read_time = time.time()

        if await self._observe():
            self.energy_value += 1 / global_config.chat.focus_value
            logger.info(f"{self.log_prefix} 能量值增加,当前能量值:{self.energy_value:.1f}")

        # Energy exhausted: clamp to 1 and drop back to NORMAL mode.
        if self.energy_value <= 1:
            self.energy_value = 1
            self.loop_mode = ChatMode.NORMAL
            return True

        return True
    elif self.loop_mode == ChatMode.NORMAL:
        # focus_value == 0 disables both switches into FOCUS mode below.
        if global_config.chat.focus_value != 0:
            if new_message_count > 3 / pow(global_config.chat.focus_value, 0.5):
                self.loop_mode = ChatMode.FOCUS
                self.energy_value = (
                    10 + (new_message_count / (3 / pow(global_config.chat.focus_value, 0.5))) * 10
                )
                return True

            if self.energy_value >= 30:
                self.loop_mode = ChatMode.FOCUS
                return True

        if new_message_count >= self.focus_energy:
            # Process the first fetched message and advance last_read_time to its "time".
            # NOTE(review): .get("time") yields None if the key is missing — confirm it is always set.
            earliest_messages_data = recent_messages_dict[0]
            self.last_read_time = earliest_messages_data.get("time")

            if_think = await self.normal_response(earliest_messages_data)
            if if_think:
                factor = max(global_config.chat.focus_value, 0.1)
                # NOTE(review): the multiplier 1.1 * factor is below 1 when factor < ~0.91,
                # so the energy shrinks in that case although the log says it increases.
                self.energy_value *= 1.1 * factor
                logger.info(f"{self.log_prefix} 进行了思考,能量值按倍数增加,当前能量值:{self.energy_value:.1f}")
            else:
                self.energy_value += 0.1 * global_config.chat.focus_value
                logger.debug(f"{self.log_prefix} 没有进行思考,能量值线性增加,当前能量值:{self.energy_value:.1f}")

            logger.debug(f"{self.log_prefix} 当前能量值:{self.energy_value:.1f}")
            return True

        await asyncio.sleep(0.5)

        return True
|
||||
|
||||
async def build_reply_to_str(self, message_data: dict):
    """Build the ``"<person_name>:<text>"`` reply-target string for a message.

    Args:
        message_data: Message dict; ``chat_info_platform``, ``user_id`` and
            ``processed_plain_text`` are read from it.

    Returns:
        The sender's stored person name and the message text, joined by a
        full-width colon.
    """
    manager = get_person_info_manager()
    platform = message_data.get("chat_info_platform")
    sender_id = message_data.get("user_id")
    person_id = manager.get_person_id(platform, sender_id)  # type: ignore
    person_name = await manager.get_value(person_id, "person_name")
    text = message_data.get('processed_plain_text')
    return f"{person_name}:{text}"
|
||||
|
||||
async def _send_and_store_reply(
    self,
    response_set,
    reply_to_str,
    loop_start_time,
    action_message,
    cycle_timers: Dict[str, float],
    thinking_id,
    plan_result,
) -> Tuple[Dict[str, Any], str, Dict[str, float]]:
    """Send a generated reply, store it as a "reply" action and build loop info.

    Args:
        response_set: Reply segments handed to ``_send_response``.
        reply_to_str: Reply-target string stored with the action.
        loop_start_time: Start time of the cycle, forwarded to ``_send_response``.
        action_message: Message being replied to (platform / user id are read).
        cycle_timers: Timer dict; the send duration is recorded under "回复发送".
        thinking_id: Id of the current thinking cycle.
        plan_result: Planner result; its ``action_result`` goes into the loop info.

    Returns:
        ``(loop_info, reply_text, cycle_timers)``.
    """
    with Timer("回复发送", cycle_timers):
        reply_text = await self._send_response(response_set, reply_to_str, loop_start_time, action_message)

    # Look up the display name of the person that was replied to.
    manager = get_person_info_manager()
    platform = action_message.get("chat_info_platform", "")
    sender_id = action_message.get("user_id", "")
    person_name = await manager.get_value(manager.get_person_id(platform, sender_id), "person_name")

    # Persist the reply action.
    await database_api.store_action_info(
        chat_stream=self.chat_stream,
        action_build_into_prompt=False,
        action_prompt_display=f"你对{person_name}进行了回复:{reply_text}",
        action_done=True,
        thinking_id=thinking_id,
        action_data={"reply_text": reply_text, "reply_to": reply_to_str},
        action_name="reply",
    )

    # Assemble the loop info for this cycle.
    plan_info = {"action_result": plan_result.get("action_result", {})}
    action_info = {
        "action_taken": True,
        "reply_text": reply_text,
        "command": "",
        "taken_time": time.time(),
    }
    loop_info: Dict[str, Any] = {"loop_plan_info": plan_info, "loop_action_info": action_info}

    return loop_info, reply_text, cycle_timers
|
||||
|
||||
async def _observe(self, message_data: Optional[Dict[str, Any]] = None) -> bool:
    """Run one thinking cycle: modify actions, plan, then reply and/or act.

    Args:
        message_data: The triggering message (passed by ``normal_response``);
            an empty dict is used when omitted.

    Returns:
        False when the ON_PLAN event handler vetoes the cycle or when no reply
        content could be generated; True otherwise.
    """
    if not message_data:
        message_data = {}
    action_type = "no_action"
    reply_text = ""  # pre-initialised to avoid UnboundLocalError
    gen_task = None  # pre-initialised to avoid UnboundLocalError
    reply_to_str = ""  # pre-initialised

    # Open a new cycle record
    cycle_timers, thinking_id = self.start_cycle()

    logger.info(f"{self.log_prefix} 开始第{self._cycle_counter}次思考[模式:{self.loop_mode}]")

    if ENABLE_S4U:
        await send_typing()

    async with global_prompt_manager.async_message_scope(self.chat_stream.context.get_template_name()):
        loop_start_time = time.time()
        await self.relationship_builder.build_relation()
        await self.expression_learner.trigger_learning_for_chat()

        available_actions = {}

        # Step 1: action modification
        with Timer("动作修改", cycle_timers):
            try:
                await self.action_modifier.modify_actions()
                available_actions = self.action_manager.get_using_actions()
            except Exception as e:
                # Best effort: on failure the cycle continues with no available actions.
                logger.error(f"{self.log_prefix} 动作修改失败: {e}")

        # In NORMAL mode, check whether any action other than the reply-related ones is available
        skip_planner = False
        if self.loop_mode == ChatMode.NORMAL:
            # Drop the reply-related actions and see whether anything is left
            non_reply_actions = {
                k: v for k, v in available_actions.items() if k not in ["reply", "no_reply", "no_action"]
            }

            if not non_reply_actions:
                skip_planner = True
                logger.info(f"{self.log_prefix} Normal模式下没有可用动作,直接回复")

                # Go straight to the reply action
                action_type = "reply"
                reasoning = ""
                action_data = {"loop_start_time": loop_start_time}
                is_parallel = False

                # Build a plan_result for the later steps
                plan_result = {
                    "action_result": {
                        "action_type": action_type,
                        "action_data": action_data,
                        "reasoning": reasoning,
                        "timestamp": time.time(),
                        "is_parallel": is_parallel,
                    },
                    "action_prompt": "",
                }
                target_message = message_data

            # NORMAL mode with the planner enabled: start generating a reply right away
            # so that it is prepared while the planner runs
            if not skip_planner:
                reply_to_str = await self.build_reply_to_str(message_data)
                gen_task = asyncio.create_task(
                    self._generate_response(
                        message_data=message_data,
                        available_actions=available_actions,
                        reply_to=reply_to_str,
                        request_type="chat.replyer.normal",
                    )
                )

        if not skip_planner:
            planner_info = self.action_planner.get_necessary_info()
            prompt_info = await self.action_planner.build_planner_prompt(
                is_group_chat=planner_info[0],
                chat_target_info=planner_info[1],
                current_available_actions=planner_info[2],
            )
            # An ON_PLAN event handler may veto the cycle.
            # NOTE(review): a gen_task started above is neither awaited nor cancelled on this return.
            if not await events_manager.handle_mai_events(
                EventType.ON_PLAN, None, prompt_info[0], None, self.chat_stream.stream_id
            ):
                return False
            with Timer("规划器", cycle_timers):
                plan_result, target_message = await self.action_planner.plan(mode=self.loop_mode)

            action_result: Dict[str, Any] = plan_result.get("action_result", {})  # type: ignore
            action_type, action_data, reasoning, is_parallel = (
                action_result.get("action_type", "error"),
                action_result.get("action_data", {}),
                action_result.get("reasoning", "未提供理由"),
                action_result.get("is_parallel", True),
            )

            action_data["loop_start_time"] = loop_start_time

        if action_type == "reply":
            logger.info(f"{self.log_prefix}{global_config.bot.nickname} 决定进行回复")
        elif is_parallel:
            logger.info(f"{self.log_prefix}{global_config.bot.nickname} 决定进行回复, 同时执行{action_type}动作")
        else:
            # Only touch gen_task when one was created
            if gen_task:
                if not gen_task.done():
                    gen_task.cancel()
                    logger.debug(f"{self.log_prefix} 已取消预生成的回复任务")
                    logger.info(
                        f"{self.log_prefix}{global_config.bot.nickname} 原本想要回复,但选择执行{action_type},不发表回复"
                    )
                elif generation_result := gen_task.result():
                    content = " ".join([item[1] for item in generation_result if item[0] == "text"])
                    logger.debug(f"{self.log_prefix} 预生成的回复任务已完成")
                    logger.info(
                        f"{self.log_prefix}{global_config.bot.nickname} 原本想要回复:{content},但选择执行{action_type},不发表回复"
                    )
                else:
                    logger.warning(f"{self.log_prefix} 预生成的回复任务未生成有效内容")

        # Prefer the triggering message; fall back to the planner's target message
        action_message = message_data or target_message
        if action_type == "reply":
            # Wait for the reply generation to finish
            if self.loop_mode == ChatMode.NORMAL:
                # No pre-generated task (planner was skipped): start generation now
                if not gen_task:
                    reply_to_str = await self.build_reply_to_str(message_data)
                    gen_task = asyncio.create_task(
                        self._generate_response(
                            message_data=message_data,
                            available_actions=available_actions,
                            reply_to=reply_to_str,
                            request_type="chat.replyer.normal",
                        )
                    )

                gather_timeout = global_config.chat.thinking_timeout
                try:
                    response_set = await asyncio.wait_for(gen_task, timeout=gather_timeout)
                except asyncio.TimeoutError:
                    logger.warning(f"{self.log_prefix} 回复生成超时>{global_config.chat.thinking_timeout}s,已跳过")
                    response_set = None

                # The model failed or timed out: no reply content was generated
                if not response_set:
                    logger.warning(f"{self.log_prefix}模型未生成回复内容")
                    return False
            else:
                logger.info(f"{self.log_prefix}{global_config.bot.nickname} 决定进行回复 (focus模式)")

                # Build the reply_to string
                reply_to_str = await self.build_reply_to_str(action_message)

                # Generate the reply
                with Timer("回复生成", cycle_timers):
                    response_set = await self._generate_response(
                        message_data=action_message,
                        available_actions=available_actions,
                        reply_to=reply_to_str,
                        request_type="chat.replyer.focus",
                    )

                if not response_set:
                    logger.warning(f"{self.log_prefix}模型未生成回复内容")
                    return False

            loop_info, reply_text, cycle_timers = await self._send_and_store_reply(
                response_set, reply_to_str, loop_start_time, action_message, cycle_timers, thinking_id, plan_result
            )

            # NOTE(review): returning here skips the bookkeeping at the end of this method
            # (last_action update, stop_typing, end_cycle/print_cycle_info, the willing-manager
            # hook and the no_reply counter reset) — confirm this is intended.
            return True

        else:
            # Parallel execution: send the reply and run the action at the same time
            # Pre-set to None to avoid undefined-name errors
            background_reply_task = None
            background_action_task = None
            # Parallel execution in NORMAL mode: wait for the pre-generated reply and send it
            if self.loop_mode == ChatMode.NORMAL and is_parallel and gen_task:

                async def handle_reply_task() -> Tuple[Optional[Dict[str, Any]], str, Dict[str, float]]:
                    # Wait for the pre-generated reply task to finish
                    gather_timeout = global_config.chat.thinking_timeout
                    try:
                        response_set = await asyncio.wait_for(gen_task, timeout=gather_timeout)

                    except asyncio.TimeoutError:
                        logger.warning(
                            f"{self.log_prefix} 并行执行:回复生成超时>{global_config.chat.thinking_timeout}s,已跳过"
                        )
                        return None, "", {}
                    except asyncio.CancelledError:
                        logger.debug(f"{self.log_prefix} 并行执行:回复生成任务已被取消")
                        return None, "", {}

                    if not response_set:
                        logger.warning(f"{self.log_prefix} 模型超时或生成回复内容为空")
                        return None, "", {}

                    reply_to_str = await self.build_reply_to_str(action_message)
                    loop_info, reply_text, cycle_timers_reply = await self._send_and_store_reply(
                        response_set,
                        reply_to_str,
                        loop_start_time,
                        action_message,
                        cycle_timers,
                        thinking_id,
                        plan_result,
                    )
                    return loop_info, reply_text, cycle_timers_reply

                # Start the reply task and keep a handle to it
                background_reply_task = asyncio.create_task(handle_reply_task())

            # Action execution task
            async def handle_action_task():
                with Timer("动作执行", cycle_timers):
                    success, reply_text, command = await self._handle_action(
                        action_type, reasoning, action_data, cycle_timers, thinking_id, action_message
                    )
                return success, reply_text, command

            # Start the action task and keep a handle to it
            background_action_task = asyncio.create_task(handle_action_task())

            reply_loop_info = None
            reply_text_from_reply = ""
            action_success = False
            action_reply_text = ""
            action_command = ""

            # Run all tasks concurrently
            if background_reply_task:
                results = await asyncio.gather(
                    background_reply_task, background_action_task, return_exceptions=True
                )
                # Handle the reply task result
                reply_result = results[0]
                if isinstance(reply_result, BaseException):
                    logger.error(f"{self.log_prefix} 回复任务执行异常: {reply_result}")
                elif reply_result and reply_result[0] is not None:
                    reply_loop_info, reply_text_from_reply, _ = reply_result

                # Handle the action task result
                action_task_result = results[1]
                if isinstance(action_task_result, BaseException):
                    logger.error(f"{self.log_prefix} 动作任务执行异常: {action_task_result}")
                else:
                    action_success, action_reply_text, action_command = action_task_result
            else:
                results = await asyncio.gather(background_action_task, return_exceptions=True)
                # Only the action task ran
                action_task_result = results[0]
                if isinstance(action_task_result, BaseException):
                    logger.error(f"{self.log_prefix} 动作任务执行异常: {action_task_result}")
                else:
                    action_success, action_reply_text, action_command = action_task_result

            # Build the final loop info
            if reply_loop_info:
                # A reply was sent: use its loop_info as the base
                loop_info = reply_loop_info
                # Overlay the action execution result
                loop_info["loop_action_info"].update(
                    {
                        "action_taken": action_success,
                        "command": action_command,
                        "taken_time": time.time(),
                    }
                )
                reply_text = reply_text_from_reply
            else:
                # No reply was sent: build an action-only loop_info
                loop_info = {
                    "loop_plan_info": {
                        "action_result": plan_result.get("action_result", {}),
                    },
                    "loop_action_info": {
                        "action_taken": action_success,
                        "reply_text": action_reply_text,
                        "command": action_command,
                        "taken_time": time.time(),
                    },
                }
                reply_text = action_reply_text

        self.last_action = action_type

        if ENABLE_S4U:
            await stop_typing()
            await mai_thinking_manager.get_mai_think(self.stream_id).do_think_after_response(reply_text)

        self.end_cycle(loop_info, cycle_timers)
        self.print_cycle_info(cycle_timers)

        if self.loop_mode == ChatMode.NORMAL:
            await self.willing_manager.after_generate_reply_handle(message_data.get("message_id", ""))

        # no_reply counter management: reset it when an action other than no_reply was executed
        if action_type != "no_reply" and action_type != "no_action":
            # Reset the counter on NoReplyAction as well as the local one
            NoReplyAction.reset_consecutive_count()
            self.no_reply_consecutive = 0
            logger.info(f"{self.log_prefix} 执行了{action_type}动作,重置no_reply计数器")
            return True
        elif action_type == "no_action":
            # NOTE(review): the original comment and the log text below say "reply action",
            # but this branch handles action_type == "no_action" — confirm which is meant.
            NoReplyAction.reset_consecutive_count()
            self.no_reply_consecutive = 0
            logger.info(f"{self.log_prefix} 执行了回复动作,重置no_reply计数器")

        if action_type == "no_reply":
            self.no_reply_consecutive += 1
            self._determine_form_type()

        return True
|
||||
|
||||
async def _main_chat_loop(self):
    """Main loop: keep running ``_loopbody`` until stopped or cancelled.

    On an unexpected exception the traceback is logged and, after 3 seconds,
    a replacement loop task is scheduled with the same completion callback.
    """
    try:
        while self.running:
            # One loop iteration
            success = await self._loopbody()
            await asyncio.sleep(0.1)
            if not success:
                break
    except asyncio.CancelledError:
        # Being cancelled after the shutdown flag was set is the normal path
        logger.info(f"{self.log_prefix} 麦麦已关闭聊天")
    except Exception:
        logger.error(f"{self.log_prefix} 麦麦聊天意外错误,将于3s后尝试重新启动")
        # Send the traceback through the logger instead of print() so it
        # ends up in the same sink as the rest of the log output.
        logger.error(traceback.format_exc())
        await asyncio.sleep(3)
        self._loop_task = asyncio.create_task(self._main_chat_loop())
        # The restarted task needs the completion callback too; otherwise its
        # termination (or a later exception) would go unreported.
        self._loop_task.add_done_callback(self._handle_loop_completion)
    logger.error(f"{self.log_prefix} 结束了当前聊天循环")
|
||||
|
||||
async def _handle_action(
    self,
    action: str,
    reasoning: str,
    action_data: dict,
    cycle_timers: Dict[str, float],
    thinking_id: str,
    action_message: dict,
) -> tuple[bool, str, str]:
    """
    Execute a planned action through a handler created by the action manager.

    Args:
        action: Action type name.
        reasoning: The planner's reasoning for the decision.
        action_data: Action data holding the parameters the action needs.
        cycle_timers: Timer dict of the current cycle.
        thinking_id: Id of the current thinking cycle.
        action_message: Message the action relates to.

    Returns:
        tuple[bool, str, str]: ``(success, reply_text, command)``. The command
        is always an empty string here. ``(False, "", "")`` is returned when
        the handler cannot be created, reports a timeout, or raises.
    """
    try:
        # Create the action handler instance
        try:
            action_handler = self.action_manager.create_action(
                action_name=action,
                action_data=action_data,
                reasoning=reasoning,
                cycle_timers=cycle_timers,
                thinking_id=thinking_id,
                chat_stream=self.chat_stream,
                log_prefix=self.log_prefix,
                action_message=action_message,
            )
        except Exception as e:
            logger.error(f"{self.log_prefix} 创建动作处理器时出错: {e}")
            traceback.print_exc()
            return False, "", ""

        if not action_handler:
            logger.warning(f"{self.log_prefix} 未能创建动作处理器: {action}")
            return False, "", ""

        # Run the action and collect its result
        result = await action_handler.handle_action()
        success, reply_text = result
        command = ""

        if reply_text == "timeout":
            self.reply_timeout_count += 1
            if self.reply_timeout_count > 5:
                logger.warning(
                    f"[{self.log_prefix} ] 连续回复超时次数过多,{global_config.chat.thinking_timeout}秒 内大模型没有返回有效内容,请检查你的api是否速度过慢或配置错误。建议不要使用推理模型,推理模型生成速度过慢。或者尝试拉高thinking_timeout参数,这可能导致回复时间过长。"
                )
            logger.warning(f"{self.log_prefix} 回复生成超时{global_config.chat.thinking_timeout}s,已跳过")
            return False, "", ""

        # The warning above is about *consecutive* timeouts, so a non-timeout
        # result must reset the counter; it was previously only ever incremented.
        self.reply_timeout_count = 0

        return success, reply_text, command

    except Exception as e:
        logger.error(f"{self.log_prefix} 处理{action}时出错: {e}")
        traceback.print_exc()
        return False, "", ""
|
||||
|
||||
async def normal_response(self, message_data: dict) -> bool:
    """
    Handle one received message in NORMAL ("interest") mode.

    Computes a reply probability from the willing manager, an optional
    per-message gain and the current talk frequency, then randomly decides
    whether to run a thinking cycle for the message.

    Args:
        message_data: Message dict; ``message_id``, ``interest_value``,
            ``additional_config``, ``is_emoji``, ``is_picid``,
            ``user_nickname`` and ``processed_plain_text`` are read.

    Returns:
        True when a thinking cycle (``_observe``) was started, else False.
    """
    interested_rate = message_data.get("interest_value") or 0.0
    message_id = message_data.get("message_id", "")

    self.willing_manager.setup(message_data, self.chat_stream)

    reply_probability = await self.willing_manager.get_reply_probability(message_id)

    if reply_probability < 1:  # not a guaranteed reply: apply gain and talk frequency
        additional_config = message_data.get("additional_config", {})
        if additional_config and "maimcore_reply_probability_gain" in additional_config:
            reply_probability += additional_config["maimcore_reply_probability_gain"]
            reply_probability = min(max(reply_probability, 0), 1)  # keep the probability within 0-1

        talk_frequency = global_config.chat.get_current_talk_frequency(self.stream_id)
        reply_probability = talk_frequency * reply_probability

    # Never reply to stickers / picture-only messages
    if message_data.get("is_emoji") or message_data.get("is_picid"):
        reply_probability = 0

    # Log the message details
    mes_name = self.chat_stream.group_info.group_name if self.chat_stream.group_info else "私聊"

    if reply_probability > 0.05:
        logger.info(
            f"[{mes_name}]"
            f"{message_data.get('user_nickname')}:"
            f"{message_data.get('processed_plain_text')}[兴趣:{interested_rate:.2f}][回复概率:{reply_probability * 100:.1f}%]"
        )

    if random.random() < reply_probability:
        try:
            await self.willing_manager.before_generate_reply_handle(message_id)
            await self._observe(message_data=message_data)
        finally:
            # Unregister the message whether or not a reply was produced; the
            # reply path previously returned without doing so.
            self.willing_manager.delete(message_id)
        return True

    # Willing manager: unregister the message once it has been processed
    self.willing_manager.delete(message_id)
    return False
|
||||
|
||||
async def _generate_response(
    self,
    message_data: dict,
    available_actions: Optional[Dict[str, ActionInfo]],
    reply_to: str,
    request_type: str = "chat.replyer.normal",
) -> Optional[list]:
    """Generate a normal reply through ``generator_api.generate_reply``.

    Args:
        message_data: Message being answered; only ``processed_plain_text``
            is read, for the failure log line.
        available_actions: Actions forwarded to the generator.
        reply_to: Reply target forwarded to the generator.
        request_type: Request label forwarded to the generator.

    Returns:
        The generated reply set, or None when generation reported failure,
        produced an empty set, or raised.
    """
    try:
        generation = await generator_api.generate_reply(
            chat_stream=self.chat_stream,
            reply_to=reply_to,
            available_actions=available_actions,
            enable_tool=global_config.tool.enable_tool,
            request_type=request_type,
            from_plugin=False,
        )
        success, reply_set, _ = generation

        if success and reply_set:
            return reply_set

        logger.info(f"对 {message_data.get('processed_plain_text')} 的回复生成失败")
        return None

    except Exception as e:
        # Any generator failure is logged with its traceback and reported as "no reply".
        logger.error(f"{self.log_prefix}回复生成出现错误:{str(e)} {traceback.format_exc()}")
        return None
|
||||
|
||||
async def _send_response(self, reply_set, reply_to, thinking_start_time, message_data) -> str:
    """Send every segment of ``reply_set`` to this chat stream.

    The first segment is sent with ``typing=False`` and, when enough new
    messages arrived while thinking (at least a random threshold of 2-4),
    as a quoted reply to ``reply_to``. Later segments are sent with
    ``typing=True`` and never quote.

    Args:
        reply_set: Iterable of segments; index 1 of each segment is the text.
        reply_to: Quote target used for the first segment when quoting.
        thinking_start_time: Timestamp at which reply generation started.
        message_data: Message being answered; ``user_platform`` and
            ``user_id`` are read from it.

    Returns:
        The concatenation of all sent segment texts.
    """
    current_time = time.time()
    stream_id = self.chat_stream.stream_id
    new_message_count = message_api.count_new_messages(
        chat_id=stream_id, start_time=thinking_start_time, end_time=current_time
    )
    reply_to_platform_id = f"{message_data.get('user_platform', '')}:{message_data.get('user_id', '')}"

    # Quote the original message only when the chat moved on while thinking.
    need_reply = new_message_count >= random.randint(2, 4)
    quote_note = "使用引用回复" if need_reply else "不使用引用回复"
    logger.info(f"{self.log_prefix} 从思考到回复,共有{new_message_count}条新消息,{quote_note}")

    reply_text = ""
    for position, reply_seg in enumerate(reply_set):
        data = reply_seg[1]
        is_first = position == 0
        send_kwargs = {
            "text": data,
            "stream_id": stream_id,
            "reply_to_platform_id": reply_to_platform_id,
            "typing": not is_first,
        }
        if is_first and need_reply:
            send_kwargs["reply_to"] = reply_to
        await send_api.text_to_stream(**send_kwargs)
        reply_text += data

    return reply_text
|
||||
138
src/chat/chat_loop/hfc_utils.py
Normal file
138
src/chat/chat_loop/hfc_utils.py
Normal file
@@ -0,0 +1,138 @@
|
||||
import time
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from src.config.config import global_config
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
from src.plugin_system.apis import send_api
|
||||
from maim_message.message_base import GroupInfo
|
||||
|
||||
from src.common.message_repository import count_messages
|
||||
|
||||
# Module-level logger, named after this module.
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class CycleDetail:
    """Record of a single chat-loop cycle: ids, timing and plan/action info."""

    def __init__(self, cycle_id: int):
        self.cycle_id = cycle_id
        self.thinking_id = ""
        self.start_time = time.time()
        self.end_time: Optional[float] = None
        self.timers: Dict[str, float] = {}

        self.loop_plan_info: Dict[str, Any] = {}
        self.loop_action_info: Dict[str, Any] = {}

    def to_dict(self) -> Dict[str, Any]:
        """Return the cycle record as a dict with plan/action info made serializable."""
        scalar_types = (str, int, float, bool, type(None))
        key_types = (str, int, float, bool)
        container_types = (dict, list, tuple)

        def _serialize(value, level=0, visiting=None):
            if visiting is None:
                visiting = set()

            # Nesting guard: beyond depth 5 fall back to the string form.
            if level > 5:
                return str(value)

            # Cycle guard: an object already on the current path is stringified.
            marker = id(value)
            if marker in visiting:
                return str(value)
            visiting.add(marker)

            try:
                # Objects exposing to_dict() serialize themselves.
                if hasattr(value, "to_dict"):
                    return value.to_dict()

                if isinstance(value, dict):
                    # Entries whose key is not a plain scalar are dropped.
                    converted = {}
                    for key, item in value.items():
                        if isinstance(key, key_types):
                            converted[key] = _serialize(item, level + 1, visiting)
                    return converted

                if isinstance(value, (list, tuple)):
                    # Nested dict/list/tuple elements are dropped from sequences.
                    kept = []
                    for item in value:
                        if not isinstance(item, container_types) or isinstance(item, scalar_types):
                            kept.append(_serialize(item, level + 1, visiting))
                    return kept

                if isinstance(value, scalar_types):
                    return value

                return str(value)
            finally:
                visiting.remove(marker)

        return {
            "cycle_id": self.cycle_id,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "timers": self.timers,
            "thinking_id": self.thinking_id,
            "loop_plan_info": _serialize(self.loop_plan_info),
            "loop_action_info": _serialize(self.loop_action_info),
        }

    def set_loop_info(self, loop_info: Dict[str, Any]):
        """Store the ``loop_plan_info`` and ``loop_action_info`` entries of ``loop_info``."""
        plan_info = loop_info["loop_plan_info"]
        action_info = loop_info["loop_action_info"]
        self.loop_plan_info = plan_info
        self.loop_action_info = action_info
|
||||
|
||||
|
||||
def get_recent_message_stats(minutes: float = 30, chat_id: Optional[str] = None) -> dict:
    """Count all messages and the bot's own messages within a recent time window.

    Args:
        minutes (float): Size of the look-back window in minutes, 30 by default.
        chat_id (str, optional): Restrict the counts to this chat. When None,
            messages of every chat are counted.

    Returns:
        dict: {"bot_reply_count": int, "total_message_count": int}
    """
    window_start = time.time() - minutes * 60
    bot_id = global_config.bot.qq_account

    base_filter: Dict[str, Any] = {"time": {"$gte": window_start}}
    if chat_id is not None:
        base_filter["chat_id"] = chat_id

    # Every message in the window.
    total_message_count = count_messages(base_filter)
    # Same window, restricted to messages whose user_id is the bot account.
    bot_reply_count = count_messages({**base_filter, "user_id": bot_id})

    return {"bot_reply_count": bot_reply_count, "total_message_count": total_message_count}
|
||||
|
||||
|
||||
async def _send_inner_state(content: str) -> None:
    """Send a non-stored ``state`` message with ``content`` to the fixed "内心" stream.

    Shared by ``send_typing`` and ``stop_typing``, which differ only in the
    payload they send.
    """
    platform = "amaidesu_default"
    group_info = GroupInfo(platform=platform, group_id="114514", group_name="内心")

    chat = await get_chat_manager().get_or_create_stream(
        platform=platform,
        user_info=None,
        group_info=group_info,
    )

    await send_api.custom_to_stream(
        message_type="state", content=content, stream_id=chat.stream_id, storage_message=False
    )


async def send_typing():
    """Send the ``typing`` state message to the "内心" stream."""
    await _send_inner_state("typing")


async def stop_typing():
    """Send the ``stop_typing`` state message to the "内心" stream."""
    await _send_inner_state("stop_typing")
|
||||
1095
src/chat/emoji_system/emoji_manager.py
Normal file
1095
src/chat/emoji_system/emoji_manager.py
Normal file
File diff suppressed because it is too large
Load Diff
648
src/chat/express/expression_learner.py
Normal file
648
src/chat/express/expression_learner.py
Normal file
@@ -0,0 +1,648 @@
|
||||
import time
|
||||
import random
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
from typing import List, Dict, Optional, Any, Tuple
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.common.database.sqlalchemy_database_api import get_session
|
||||
from sqlalchemy import select
|
||||
from src.common.database.sqlalchemy_models import Expression
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.config.config import model_config, global_config
|
||||
from src.chat.utils.chat_message_builder import get_raw_msg_by_timestamp_with_chat_inclusive, build_anonymous_messages
|
||||
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
|
||||
|
||||
# Cap on stored expressions per (chat_id, type); learn_and_store deletes the
# lowest-count rows above this limit.
MAX_EXPRESSION_COUNT = 300
DECAY_DAYS = 30  # days of inactivity at which the decay value reaches 0.01
DECAY_MIN = 0.01  # minimum decay value; NOTE(review): the visible code uses 0.01 literals instead - confirm usage

logger = get_logger("expressor")
# Module-level database session shared by everything in this module.
session = get_session()
|
||||
|
||||
def format_create_date(timestamp: float) -> str:
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` local-time string.

    Args:
        timestamp: Seconds since the epoch.

    Returns:
        The formatted date, or "未知时间" when the value cannot be converted
        (out of range, NaN/infinite, or not a number at all).
    """
    try:
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")
    except (ValueError, OSError, OverflowError, TypeError):
        # fromtimestamp raises OverflowError for values outside the platform
        # time_t range (including infinity) and TypeError for None/non-numbers;
        # all of these mean "no usable date".
        return "未知时间"
|
||||
|
||||
|
||||
def init_prompt() -> None:
    """Create the two named prompt templates used for expression learning.

    ``learn_style_prompt`` asks the model to summarize language style and
    ``learn_grammar_prompt`` asks for grammar/syntax habits; both take a
    ``chat_str`` placeholder holding the chat transcript.
    """
    # NOTE(review): the format line of each template says 当"..."时,可以"...",
    # while the examples (and parse_expression_response) use 使用"..." - lines
    # written with 可以 are not picked up by the parser; confirm this is intended.
    learn_style_prompt = """
{chat_str}

请从上面这段群聊中概括除了人名为"SELF"之外的人的语言风格
1. 只考虑文字,不要考虑表情包和图片
2. 不要涉及具体的人名,只考虑语言风格
3. 语言风格包含特殊内容和情感
4. 思考有没有特殊的梗,一并总结成语言风格
5. 例子仅供参考,请严格根据群聊内容总结!!!
注意:总结成如下格式的规律,总结的内容要详细,但具有概括性:
例如:当"AAAAA"时,可以"BBBBB", AAAAA代表某个具体的场景,不超过20个字。BBBBB代表对应的语言风格,特定句式或表达方式,不超过20个字。

例如:
当"对某件事表示十分惊叹,有些意外"时,使用"我嘞个xxxx"
当"表示讽刺的赞同,不想讲道理"时,使用"对对对"
当"想说明某个具体的事实观点,但懒得明说,或者不便明说,或表达一种默契",使用"懂的都懂"
当"当涉及游戏相关时,表示意外的夸赞,略带戏谑意味"时,使用"这么强!"

请注意:不要总结你自己(SELF)的发言
现在请你概括
"""
    Prompt(learn_style_prompt, "learn_style_prompt")

    learn_grammar_prompt = """
{chat_str}

请从上面这段群聊中概括除了人名为"SELF"之外的人的语法和句法特点,只考虑纯文字,不要考虑表情包和图片
1.不要总结【图片】,【动画表情】,[图片],[动画表情],不总结 表情符号 at @ 回复 和[回复]
2.不要涉及具体的人名,只考虑语法和句法特点,
3.语法和句法特点要包括,句子长短(具体字数),有何种语病,如何拆分句子。
4. 例子仅供参考,请严格根据群聊内容总结!!!
总结成如下格式的规律,总结的内容要简洁,不浮夸:
当"xxx"时,可以"xxx"

例如:
当"表达观点较复杂"时,使用"省略主语(3-6个字)"的句法
当"不用详细说明的一般表达"时,使用"非常简洁的句子"的句法
当"需要单纯简单的确认"时,使用"单字或几个字的肯定(1-2个字)"的句法

注意不要总结你自己(SELF)的发言
现在请你概括
"""
    Prompt(learn_grammar_prompt, "learn_grammar_prompt")
|
||||
|
||||
|
||||
class ExpressionLearner:
    """Learns expression habits (style and grammar) for one chat stream.

    Recent messages of the chat are summarized by an LLM into
    (situation, style) pairs which are stored in the ``Expression`` table.
    """

    def __init__(self, chat_id: str) -> None:
        self.express_learn_model: LLMRequest = LLMRequest(
            model_set=model_config.model_task_config.replyer_1, request_type="expressor.learner"
        )
        self.chat_id = chat_id
        self.chat_name = get_chat_manager().get_stream_name(chat_id) or chat_id

        # Time of the last learning run for this chat (starts at construction time).
        self.last_learning_time: float = time.time()

        # Learning thresholds.
        self.min_messages_for_learning = 25  # minimum number of new messages needed to learn
        self.min_learning_interval = 300  # base minimum interval between runs, in seconds

    def can_learn_for_chat(self) -> bool:
        """Return whether the expression config enables learning for this chat.

        Returns:
            bool: The ``enable_learning`` flag, or False when the config lookup fails.
        """
        try:
            _, enable_learning, _ = global_config.expression.get_expression_config_for_chat(self.chat_id)
            return enable_learning
        except Exception as e:
            logger.error(f"检查学习权限失败: {e}")
            return False

    def should_trigger_learning(self) -> bool:
        """Return whether a learning run is due for this chat.

        A run is due when learning is enabled, the intensity-scaled interval
        since the last run has elapsed, and enough messages arrived since then.

        Returns:
            bool: True when learning should be triggered now.
        """
        current_time = time.time()

        try:
            _, enable_learning, learning_intensity = global_config.expression.get_expression_config_for_chat(
                self.chat_id
            )
        except Exception as e:
            logger.error(f"获取聊天流 {self.chat_id} 的学习配置失败: {e}")
            return False

        if not enable_learning:
            return False

        # A zero (or negative) intensity would make the interval below divide
        # by zero or become meaningless; treat it as "never learn".
        if not learning_intensity or learning_intensity <= 0:
            return False

        # Higher intensity shortens the minimum interval between runs.
        min_interval = self.min_learning_interval / learning_intensity
        if current_time - self.last_learning_time < min_interval:
            return False

        # Only messages of this chat since the last run count.
        recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive(
            chat_id=self.chat_id,
            timestamp_start=self.last_learning_time,
            timestamp_end=time.time(),
        )

        return bool(recent_messages) and len(recent_messages) >= self.min_messages_for_learning

    async def trigger_learning_for_chat(self) -> bool:
        """Run style and grammar learning for this chat when a run is due.

        Returns:
            bool: True when at least one of the two learning passes produced
            expressions; False when no run was due, nothing was learnt, or
            an error occurred.
        """
        if not self.should_trigger_learning():
            return False

        try:
            logger.info(f"为聊天流 {self.chat_name} 触发表达学习")

            learnt_style = await self.learn_and_store(type="style", num=25)
            learnt_grammar = await self.learn_and_store(type="grammar", num=10)

            # The window for the next run starts now, whatever the outcome.
            self.last_learning_time = time.time()

            if learnt_style or learnt_grammar:
                logger.info(f"聊天流 {self.chat_name} 表达学习完成")
                return True

            logger.warning(f"聊天流 {self.chat_name} 表达学习未获得有效结果")
            return False

        except Exception as e:
            logger.error(f"为聊天流 {self.chat_name} 触发学习失败: {e}")
            return False

    def _load_expressions(self, expr_type: str) -> List[Dict[str, Any]]:
        """Return all stored expressions of ``expr_type`` for this chat as dicts."""
        rows = session.execute(
            select(Expression).where((Expression.chat_id == self.chat_id) & (Expression.type == expr_type))
        )
        return [
            {
                "situation": expr.situation,
                "style": expr.style,
                "count": expr.count,
                "last_active_time": expr.last_active_time,
                "source_id": self.chat_id,
                "type": expr_type,
                # Rows without a create_date fall back to last_active_time.
                "create_date": expr.create_date if expr.create_date is not None else expr.last_active_time,
            }
            for expr in rows.scalars()
        ]

    def get_expression_by_chat_id(self) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Return the stored style and grammar expressions of this chat.

        Every returned dict carries ``source_id`` (this chat's id) so later
        updates can be attributed to the chat.

        Returns:
            A ``(style_expressions, grammar_expressions)`` tuple.
        """
        learnt_style_expressions = self._load_expressions("style")
        learnt_grammar_expressions = self._load_expressions("grammar")
        return learnt_style_expressions, learnt_grammar_expressions

    def _apply_global_decay_to_database(self, current_time: float) -> None:
        """Apply time-based decay to the count of every stored expression.

        Expressions whose decayed count reaches the 0.01 floor are deleted.
        Any failure rolls the session back and is logged.
        """
        try:
            all_expressions = session.execute(select(Expression)).scalars()

            updated_count = 0
            deleted_count = 0

            for expr in all_expressions:
                # Days since the expression was last active.
                time_diff_days = (current_time - expr.last_active_time) / (24 * 3600)

                decay_value = self.calculate_decay_factor(time_diff_days)
                new_count = max(0.01, expr.count - decay_value)

                if new_count <= 0.01:
                    # Count decayed to the floor: drop the expression.
                    session.delete(expr)
                    deleted_count += 1
                else:
                    expr.count = new_count
                    updated_count += 1

            session.commit()

            if updated_count > 0 or deleted_count > 0:
                logger.info(f"全局衰减完成:更新了 {updated_count} 个表达方式,删除了 {deleted_count} 个表达方式")

        except Exception as e:
            session.rollback()
            logger.error(f"数据库全局衰减失败: {e}")

    def calculate_decay_factor(self, time_diff_days: float) -> float:
        """Return the decay value for an expression inactive for ``time_diff_days``.

        The value is 0 for zero or negative inactivity, 0.01 from
        ``DECAY_DAYS`` days on, and follows ``y = a * x^2`` in between with
        ``a`` chosen so the curve reaches 0.01 at ``DECAY_DAYS``.
        """
        if time_diff_days <= 0:
            return 0.0  # just-activated expressions do not decay

        if time_diff_days >= DECAY_DAYS:
            return 0.01  # cap for long-inactive expressions

        a = 0.01 / (DECAY_DAYS**2)
        decay = a * (time_diff_days**2)

        return min(0.01, decay)

    async def learn_and_store(self, type: str, num: int = 10) -> List[Tuple[str, str, str]]:
        # sourcery skip: use-join
        """Learn expressions of the given kind and store them in the database.

        Args:
            type: "style" or "grammar".
            num: Maximum number of recent messages to learn from.

        Returns:
            The learnt ``(chat_id, situation, style)`` tuples; empty when
            learning is disabled for this chat or nothing was learnt.

        Raises:
            ValueError: If ``type`` is neither "style" nor "grammar".
            Exception: Database errors are re-raised after the session has
                been rolled back.
        """
        if type == "style":
            type_str = "语言风格"
        elif type == "grammar":
            type_str = "句法特点"
        else:
            raise ValueError(f"Invalid type: {type}")

        # Respect the per-chat learning switch before doing any work.
        if not self.can_learn_for_chat():
            logger.debug(f"聊天流 {self.chat_name} 不允许学习表达,跳过学习")
            return []

        res = await self.learn_expression(type, num)
        if res is None:
            return []
        learnt_expressions, chat_id = res

        if not learnt_expressions:
            logger.info(f"没有学习到{type_str}")
            return []

        chat_stream = get_chat_manager().get_stream(chat_id)
        if chat_stream is None:
            group_name = f"聊天流 {chat_id}"
        elif chat_stream.group_info:
            group_name = chat_stream.group_info.group_name
        else:
            group_name = f"{chat_stream.user_info.user_nickname}的私聊"

        learnt_expressions_str = "".join(
            f"{situation}->{style}\n" for _chat_id, situation, style in learnt_expressions
        )
        logger.info(f"在 {group_name} 学习到{type_str}:\n{learnt_expressions_str}")

        # Group the learnt pairs by chat id.
        chat_dict: Dict[str, List[Dict[str, Any]]] = {}
        for expr_chat_id, situation, style in learnt_expressions:
            chat_dict.setdefault(expr_chat_id, []).append({"situation": situation, "style": style})

        current_time = time.time()

        try:
            for expr_chat_id, expr_list in chat_dict.items():
                for new_expr in expr_list:
                    # Look for an identical stored expression.
                    existing = session.execute(
                        select(Expression).where(
                            (Expression.chat_id == expr_chat_id)
                            & (Expression.type == type)
                            & (Expression.situation == new_expr["situation"])
                            & (Expression.style == new_expr["style"])
                        )
                    ).scalar()
                    if existing:
                        # 50% chance to overwrite the stored text with the new one.
                        if random.random() < 0.5:
                            existing.situation = new_expr["situation"]
                            existing.style = new_expr["style"]
                        existing.count = existing.count + 1
                        existing.last_active_time = current_time
                    else:
                        session.add(
                            Expression(
                                situation=new_expr["situation"],
                                style=new_expr["style"],
                                count=1,
                                last_active_time=current_time,
                                chat_id=expr_chat_id,
                                type=type,
                                create_date=current_time,  # set the creation date explicitly
                            )
                        )

                # Enforce the per-chat, per-type cap by dropping the lowest counts.
                exprs = list(
                    session.execute(
                        select(Expression)
                        .where((Expression.chat_id == expr_chat_id) & (Expression.type == type))
                        .order_by(Expression.count.asc())
                    ).scalars()
                )
                if len(exprs) > MAX_EXPRESSION_COUNT:
                    for expr in exprs[: len(exprs) - MAX_EXPRESSION_COUNT]:
                        session.delete(expr)

            session.commit()
        except Exception:
            # The session is shared module-wide: leave it usable for later
            # callers, then let the caller handle the error as before.
            session.rollback()
            raise

        return learnt_expressions

    async def learn_expression(self, type: str, num: int = 10) -> Optional[Tuple[List[Tuple[str, str, str]], str]]:
        """Ask the LLM to summarize expressions from this chat's recent messages.

        Args:
            type: "style" or "grammar".
            num: Maximum number of messages fetched since the last learning run.

        Returns:
            ``(expressions, chat_id)`` where ``expressions`` is a list of
            ``(chat_id, situation, style)`` tuples, or None when there are no
            messages or the model request failed.

        Raises:
            ValueError: If ``type`` is neither "style" nor "grammar".
        """
        if type == "style":
            type_str = "语言风格"
            prompt_name = "learn_style_prompt"
        elif type == "grammar":
            type_str = "句法特点"
            prompt_name = "learn_grammar_prompt"
        else:
            raise ValueError(f"Invalid type: {type}")

        current_time = time.time()

        # Messages of this chat since the last learning run.
        random_msg: Optional[List[Dict[str, Any]]] = get_raw_msg_by_timestamp_with_chat_inclusive(
            chat_id=self.chat_id,
            timestamp_start=self.last_learning_time,
            timestamp_end=current_time,
            limit=num,
        )

        if not random_msg:
            return None

        chat_id: str = random_msg[0]["chat_id"]
        random_msg_str: str = await build_anonymous_messages(random_msg)

        prompt: str = await global_prompt_manager.format_prompt(
            prompt_name,
            chat_str=random_msg_str,
        )

        logger.debug(f"学习{type_str}的prompt: {prompt}")

        try:
            response, _ = await self.express_learn_model.generate_response_async(prompt, temperature=0.3)
        except Exception as e:
            logger.error(f"学习{type_str}失败: {e}")
            return None

        logger.debug(f"学习{type_str}的response: {response}")

        expressions: List[Tuple[str, str, str]] = self.parse_expression_response(response, chat_id)

        return expressions, chat_id

    def parse_expression_response(self, response: str, chat_id: str) -> List[Tuple[str, str, str]]:
        """Parse the LLM summary into ``(chat_id, situation, style)`` tuples.

        Each line is expected to contain 当"<situation>" followed later by
        使用"<style>"; lines that do not match this shape are skipped.
        """
        expressions: List[Tuple[str, str, str]] = []
        for line in response.splitlines():
            line = line.strip()
            if not line:
                continue
            # Locate 当" and the quote closing the situation.
            idx_when = line.find('当"')
            if idx_when == -1:
                continue
            idx_quote1 = idx_when + 1
            idx_quote2 = line.find('"', idx_quote1 + 1)
            if idx_quote2 == -1:
                continue
            situation = line[idx_quote1 + 1 : idx_quote2]
            # Locate 使用" and the quote closing the style.
            idx_use = line.find('使用"', idx_quote2)
            if idx_use == -1:
                continue
            idx_quote3 = idx_use + 2
            idx_quote4 = line.find('"', idx_quote3 + 1)
            if idx_quote4 == -1:
                continue
            style = line[idx_quote3 + 1 : idx_quote4]
            expressions.append((chat_id, situation, style))
        return expressions
|
||||
|
||||
|
||||
# Build the named prompt templates at import time; learn_expression later
# looks them up by name through global_prompt_manager.
init_prompt()
|
||||
|
||||
class ExpressionLearnerManager:
    """Creates and caches one ``ExpressionLearner`` per chat id.

    Construction also prepares the expression data directories and runs the
    one-off data migrations.
    """

    def __init__(self):
        self.expression_learners = {}

        self._ensure_expression_directories()
        self._auto_migrate_json_to_db()
        self._migrate_old_data_create_date()

    def get_expression_learner(self, chat_id: str) -> ExpressionLearner:
        """Return the learner for ``chat_id``, creating it on first use."""
        if chat_id not in self.expression_learners:
            self.expression_learners[chat_id] = ExpressionLearner(chat_id)
        return self.expression_learners[chat_id]

    def _ensure_expression_directories(self):
        """Make sure the expression data directories exist (errors are logged)."""
        base_dir = os.path.join("data", "expression")
        directories_to_create = [
            base_dir,
            os.path.join(base_dir, "learnt_style"),
            os.path.join(base_dir, "learnt_grammar"),
        ]

        for directory in directories_to_create:
            try:
                os.makedirs(directory, exist_ok=True)
                logger.debug(f"确保目录存在: {directory}")
            except Exception as e:
                logger.error(f"创建目录失败 {directory}: {e}")

    def _migrate_expression_file(self, expr_file: str, chat_id: str, type_str: str) -> int:
        """Import one ``expressions.json`` into the database and commit it.

        Returns the number of newly inserted expressions. Read, parse and
        database errors propagate to the caller.
        """
        with open(expr_file, "r", encoding="utf-8") as f:
            expressions = json.load(f)

        if not isinstance(expressions, list):
            logger.warning(f"表达方式文件格式错误,跳过: {expr_file}")
            return 0

        inserted = 0
        for expr in expressions:
            if not isinstance(expr, dict):
                continue

            situation = expr.get("situation")
            style_val = expr.get("style")
            count = expr.get("count", 1)
            last_active_time = expr.get("last_active_time", time.time())

            if not situation or not style_val:
                logger.warning(f"表达方式缺少必要字段,跳过: {expr}")
                continue

            # De-duplicate on chat_id + type + situation + style.
            existing = session.execute(
                select(Expression).where(
                    (Expression.chat_id == chat_id)
                    & (Expression.type == type_str)
                    & (Expression.situation == situation)
                    & (Expression.style == style_val)
                )
            ).scalar()
            if existing:
                existing.count = max(existing.count, count)
                existing.last_active_time = max(existing.last_active_time, last_active_time)
            else:
                session.add(
                    Expression(
                        situation=situation,
                        style=style_val,
                        count=count,
                        last_active_time=last_active_time,
                        chat_id=chat_id,
                        type=type_str,
                        create_date=last_active_time,  # migrated rows use last_active_time as creation time
                    )
                )
                inserted += 1

        # Persist this file's rows before it is reported as migrated.
        session.commit()
        logger.info(f"已迁移 {expr_file} 到数据库,包含 {len(expressions)} 个表达方式")
        return inserted

    def _auto_migrate_json_to_db(self):
        """Migrate every ``expressions.json`` under data/expression into the database.

        Both ``learnt_style`` and ``learnt_grammar`` are scanned. Each file is
        committed on its own; a failing file is rolled back and logged without
        stopping the others. Afterwards the marker file
        ``data/expression/done.done`` is written, and the whole migration is
        skipped whenever that marker already exists.
        """
        base_dir = os.path.join("data", "expression")
        done_flag = os.path.join(base_dir, "done.done")

        try:
            os.makedirs(base_dir, exist_ok=True)
            logger.debug(f"确保目录存在: {base_dir}")
        except Exception as e:
            logger.error(f"创建表达方式目录失败: {e}")
            return

        if os.path.exists(done_flag):
            logger.info("表达方式JSON已迁移,无需重复迁移。")
            return

        logger.info("开始迁移表达方式JSON到数据库...")
        migrated_count = 0

        for dir_name in ["learnt_style", "learnt_grammar"]:
            type_str = "style" if dir_name == "learnt_style" else "grammar"
            type_dir = os.path.join(base_dir, dir_name)
            if not os.path.exists(type_dir):
                logger.debug(f"目录不存在,跳过: {type_dir}")
                continue

            try:
                chat_ids = os.listdir(type_dir)
                logger.debug(f"在 {type_dir} 中找到 {len(chat_ids)} 个聊天ID目录")
            except Exception as e:
                logger.error(f"读取目录失败 {type_dir}: {e}")
                continue

            for chat_id in chat_ids:
                expr_file = os.path.join(type_dir, chat_id, "expressions.json")
                if not os.path.exists(expr_file):
                    continue
                try:
                    migrated_count += self._migrate_expression_file(expr_file, chat_id, type_str)
                except json.JSONDecodeError as e:
                    logger.error(f"JSON解析失败 {expr_file}: {e}")
                except Exception as e:
                    # Drop this file's partial changes so the shared session
                    # stays usable for the remaining files.
                    session.rollback()
                    logger.error(f"迁移表达方式 {expr_file} 失败: {e}")

        # Mark the migration as done.
        try:
            done_parent_dir = os.path.dirname(done_flag)
            if not os.path.exists(done_parent_dir):
                os.makedirs(done_parent_dir, exist_ok=True)
                logger.debug(f"为done.done创建父目录: {done_parent_dir}")

            with open(done_flag, "w", encoding="utf-8") as f:
                f.write("done\n")
            logger.info(f"表达方式JSON迁移已完成,共迁移 {migrated_count} 个表达方式,已写入done.done标记文件")
        except PermissionError as e:
            logger.error(f"权限不足,无法写入done.done标记文件: {e}")
        except OSError as e:
            logger.error(f"文件系统错误,无法写入done.done标记文件: {e}")
        except Exception as e:
            logger.error(f"写入done.done标记文件失败: {e}")

    def _migrate_old_data_create_date(self):
        """Give rows without ``create_date`` their ``last_active_time`` as creation date.

        Any failure rolls the session back and is logged.
        """
        try:
            old_expressions = session.execute(
                select(Expression).where(Expression.create_date.is_(None))
            ).scalars()
            updated_count = 0

            for expr in old_expressions:
                expr.create_date = expr.last_active_time
                updated_count += 1

            session.commit()

            if updated_count > 0:
                logger.info(f"已为 {updated_count} 个老的表达方式设置创建日期")
        except Exception as e:
            session.rollback()
            logger.error(f"迁移老数据创建日期失败: {e}")
|
||||
|
||||
|
||||
# Module-level manager instance; constructing it runs the directory setup and
# both data migrations at import time.
expression_learner_manager = ExpressionLearnerManager()
|
||||
339
src/chat/express/expression_selector.py
Normal file
339
src/chat/express/expression_selector.py
Normal file
@@ -0,0 +1,339 @@
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import hashlib
|
||||
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
from json_repair import repair_json
|
||||
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.config.config import global_config, model_config
|
||||
from src.common.logger import get_logger
|
||||
from sqlalchemy import select
|
||||
from src.common.database.sqlalchemy_models import Expression
|
||||
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
||||
from src.common.database.sqlalchemy_database_api import get_session
|
||||
# Module-level database session used by the queries in this module.
session = get_session()

logger = get_logger("expression_selector")
|
||||
|
||||
|
||||
def init_prompt():
    """Create the named ``expression_evaluation_prompt`` template.

    The template asks the model to pick the ``{min_num}``-``{max_num}``
    situations that best fit the current chat and to answer with a JSON
    object holding the chosen situation numbers under ``selected_situations``.
    """
    expression_evaluation_prompt = """
以下是正在进行的聊天内容:
{chat_observe_info}

你的名字是{bot_name}{target_message}

以下是可选的表达情境:
{all_situations}

请你分析聊天内容的语境、情绪、话题类型,从上述情境中选择最适合当前聊天情境的{min_num}-{max_num}个情境。
考虑因素包括:
1. 聊天的情绪氛围(轻松、严肃、幽默等)
2. 话题类型(日常、技术、游戏、情感等)
3. 情境与当前语境的匹配度
{target_message_extra_block}

请以JSON格式输出,只需要输出选中的情境编号:
例如:
{{
"selected_situations": [2, 3, 5, 7, 19, 22, 25, 38, 39, 45, 48, 64]
}}

请严格按照JSON格式输出,不要包含其他内容:
"""
    Prompt(expression_evaluation_prompt, "expression_evaluation_prompt")
|
||||
|
||||
|
||||
def weighted_sample(population: List[Dict], weights: List[float], k: int) -> List[Dict]:
    """Draw up to ``k`` distinct items from ``population`` with weighted probability.

    Items are drawn one at a time without replacement, each draw weighted by
    the remaining items' weights. The inputs are not modified.

    Args:
        population: Candidate items.
        weights: One weight per item, in the same order as ``population``.
        k: Number of items to draw.

    Returns:
        An empty list when ``population`` or ``weights`` is empty or
        ``k <= 0``; a copy of ``population`` when it has at most ``k`` items;
        otherwise a list of ``k`` sampled items.
    """
    if not population or not weights or k <= 0:
        return []

    if len(population) <= k:
        return population.copy()

    selected = []
    remaining = population.copy()
    remaining_weights = weights.copy()

    for _ in range(k):
        if not remaining:
            break

        if sum(remaining_weights) > 0:
            chosen_idx = random.choices(range(len(remaining)), weights=remaining_weights)[0]
        else:
            # random.choices raises ValueError when the weights sum to zero,
            # which happens once every positively weighted item has been
            # taken; fall back to a uniform pick among what is left.
            chosen_idx = random.randrange(len(remaining))

        selected.append(remaining.pop(chosen_idx))
        remaining_weights.pop(chosen_idx)

    return selected
|
||||
|
||||
|
||||
class ExpressionSelector:
|
||||
def __init__(self):
    """Create the LLM request object used for expression selection."""
    selector_models = model_config.model_task_config.utils_small
    self.llm_model = LLMRequest(model_set=selector_models, request_type="expression.selector")
|
||||
|
||||
def can_use_expression_for_chat(self, chat_id: str) -> bool:
    """Return whether the expression config allows using expressions in a chat.

    Args:
        chat_id: Id of the chat stream.

    Returns:
        bool: The ``use_expression`` flag, or False when the config lookup fails.
    """
    try:
        chat_config = global_config.expression.get_expression_config_for_chat(chat_id)
        use_expression, _, _ = chat_config
        return use_expression
    except Exception as e:
        logger.error(f"检查表达使用权限失败: {e}")
        return False
|
||||
|
||||
@staticmethod
|
||||
def _parse_stream_config_to_chat_id(stream_config_str: str) -> Optional[str]:
|
||||
"""解析'platform:id:type'为chat_id(与get_stream_id一致)"""
|
||||
try:
|
||||
parts = stream_config_str.split(":")
|
||||
if len(parts) != 3:
|
||||
return None
|
||||
platform = parts[0]
|
||||
id_str = parts[1]
|
||||
stream_type = parts[2]
|
||||
is_group = stream_type == "group"
|
||||
if is_group:
|
||||
components = [platform, str(id_str)]
|
||||
else:
|
||||
components = [platform, str(id_str), "private"]
|
||||
key = "_".join(components)
|
||||
return hashlib.md5(key.encode()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def get_related_chat_ids(self, chat_id: str) -> List[str]:
    """Return every chat id sharing an expression group with ``chat_id``.

    The configured ``expression_groups`` are scanned in order; the first
    group whose resolved chat ids contain ``chat_id`` is returned (it includes
    ``chat_id`` itself). Without a matching group the result is ``[chat_id]``.
    """
    for group in global_config.expression.expression_groups:
        # Entries that cannot be parsed into a chat id are ignored.
        resolved_ids = [
            candidate
            for candidate in (self._parse_stream_config_to_chat_id(entry) for entry in group)
            if candidate
        ]
        if chat_id in resolved_ids:
            return resolved_ids
    return [chat_id]
|
||||
|
||||
def get_random_expressions(
|
||||
self, chat_id: str, total_num: int, style_percentage: float, grammar_percentage: float
|
||||
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||
# sourcery skip: extract-duplicate-method, move-assign
|
||||
# 支持多chat_id合并抽选
|
||||
related_chat_ids = self.get_related_chat_ids(chat_id)
|
||||
|
||||
# 优化:一次性查询所有相关chat_id的表达方式
|
||||
style_query = session.execute(select(Expression).where(
|
||||
(Expression.chat_id.in_(related_chat_ids)) & (Expression.type == "style")
|
||||
))
|
||||
grammar_query = session.execute(select(Expression).where(
|
||||
(Expression.chat_id.in_(related_chat_ids)) & (Expression.type == "grammar")
|
||||
))
|
||||
|
||||
style_exprs = [
|
||||
{
|
||||
"situation": expr.situation,
|
||||
"style": expr.style,
|
||||
"count": expr.count,
|
||||
"last_active_time": expr.last_active_time,
|
||||
"source_id": expr.chat_id,
|
||||
"type": "style",
|
||||
"create_date": expr.create_date if expr.create_date is not None else expr.last_active_time,
|
||||
}
|
||||
for expr in style_query.scalars()
|
||||
]
|
||||
|
||||
grammar_exprs = [
|
||||
{
|
||||
"situation": expr.situation,
|
||||
"style": expr.style,
|
||||
"count": expr.count,
|
||||
"last_active_time": expr.last_active_time,
|
||||
"source_id": expr.chat_id,
|
||||
"type": "grammar",
|
||||
"create_date": expr.create_date if expr.create_date is not None else expr.last_active_time,
|
||||
}
|
||||
for expr in grammar_query.scalars()
|
||||
]
|
||||
|
||||
style_num = int(total_num * style_percentage)
|
||||
grammar_num = int(total_num * grammar_percentage)
|
||||
# 按权重抽样(使用count作为权重)
|
||||
if style_exprs:
|
||||
style_weights = [expr.get("count", 1) for expr in style_exprs]
|
||||
selected_style = weighted_sample(style_exprs, style_weights, style_num)
|
||||
else:
|
||||
selected_style = []
|
||||
if grammar_exprs:
|
||||
grammar_weights = [expr.get("count", 1) for expr in grammar_exprs]
|
||||
selected_grammar = weighted_sample(grammar_exprs, grammar_weights, grammar_num)
|
||||
else:
|
||||
selected_grammar = []
|
||||
return selected_style, selected_grammar
|
||||
|
||||
def update_expressions_count_batch(self, expressions_to_update: List[Dict[str, Any]], increment: float = 0.1):
|
||||
"""对一批表达方式更新count值,按chat_id+type分组后一次性写入数据库"""
|
||||
if not expressions_to_update:
|
||||
return
|
||||
updates_by_key = {}
|
||||
for expr in expressions_to_update:
|
||||
source_id: str = expr.get("source_id") # type: ignore
|
||||
expr_type: str = expr.get("type", "style")
|
||||
situation: str = expr.get("situation") # type: ignore
|
||||
style: str = expr.get("style") # type: ignore
|
||||
if not source_id or not situation or not style:
|
||||
logger.warning(f"表达方式缺少必要字段,无法更新: {expr}")
|
||||
continue
|
||||
key = (source_id, expr_type, situation, style)
|
||||
if key not in updates_by_key:
|
||||
updates_by_key[key] = expr
|
||||
for chat_id, expr_type, situation, style in updates_by_key:
|
||||
query = session.execute(select(Expression).where(
|
||||
(Expression.chat_id == chat_id)
|
||||
& (Expression.type == expr_type)
|
||||
& (Expression.situation == situation)
|
||||
& (Expression.style == style)
|
||||
)).scalar()
|
||||
if query:
|
||||
expr_obj = query
|
||||
current_count = expr_obj.count
|
||||
new_count = min(current_count + increment, 5.0)
|
||||
expr_obj.count = new_count
|
||||
expr_obj.last_active_time = time.time()
|
||||
session.commit()
|
||||
logger.debug(
|
||||
f"表达方式激活: 原count={current_count:.3f}, 增量={increment}, 新count={new_count:.3f} in db"
|
||||
)
|
||||
|
||||
async def select_suitable_expressions_llm(
|
||||
self,
|
||||
chat_id: str,
|
||||
chat_info: str,
|
||||
max_num: int = 10,
|
||||
min_num: int = 5,
|
||||
target_message: Optional[str] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
# sourcery skip: inline-variable, list-comprehension
|
||||
"""使用LLM选择适合的表达方式"""
|
||||
|
||||
# 检查是否允许在此聊天流中使用表达
|
||||
if not self.can_use_expression_for_chat(chat_id):
|
||||
logger.debug(f"聊天流 {chat_id} 不允许使用表达,返回空列表")
|
||||
return []
|
||||
|
||||
# 1. 获取35个随机表达方式(现在按权重抽取)
|
||||
style_exprs, grammar_exprs = self.get_random_expressions(chat_id, 30, 0.5, 0.5)
|
||||
|
||||
# 2. 构建所有表达方式的索引和情境列表
|
||||
all_expressions = []
|
||||
all_situations = []
|
||||
|
||||
# 添加style表达方式
|
||||
for expr in style_exprs:
|
||||
if isinstance(expr, dict) and "situation" in expr and "style" in expr:
|
||||
expr_with_type = expr.copy()
|
||||
expr_with_type["type"] = "style"
|
||||
all_expressions.append(expr_with_type)
|
||||
all_situations.append(f"{len(all_expressions)}.{expr['situation']}")
|
||||
|
||||
# 添加grammar表达方式
|
||||
for expr in grammar_exprs:
|
||||
if isinstance(expr, dict) and "situation" in expr and "style" in expr:
|
||||
expr_with_type = expr.copy()
|
||||
expr_with_type["type"] = "grammar"
|
||||
all_expressions.append(expr_with_type)
|
||||
all_situations.append(f"{len(all_expressions)}.{expr['situation']}")
|
||||
|
||||
if not all_expressions:
|
||||
logger.warning("没有找到可用的表达方式")
|
||||
return []
|
||||
|
||||
all_situations_str = "\n".join(all_situations)
|
||||
|
||||
if target_message:
|
||||
target_message_str = f",现在你想要回复消息:{target_message}"
|
||||
target_message_extra_block = "4.考虑你要回复的目标消息"
|
||||
else:
|
||||
target_message_str = ""
|
||||
target_message_extra_block = ""
|
||||
|
||||
# 3. 构建prompt(只包含情境,不包含完整的表达方式)
|
||||
prompt = (await global_prompt_manager.get_prompt_async("expression_evaluation_prompt")).format(
|
||||
bot_name=global_config.bot.nickname,
|
||||
chat_observe_info=chat_info,
|
||||
all_situations=all_situations_str,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
target_message=target_message_str,
|
||||
target_message_extra_block=target_message_extra_block,
|
||||
)
|
||||
|
||||
# print(prompt)
|
||||
|
||||
# 4. 调用LLM
|
||||
try:
|
||||
|
||||
# start_time = time.time()
|
||||
content, (reasoning_content, model_name, _) = await self.llm_model.generate_response_async(prompt=prompt)
|
||||
# logger.info(f"LLM请求时间: {model_name} {time.time() - start_time} \n{prompt}")
|
||||
|
||||
# logger.info(f"模型名称: {model_name}")
|
||||
# logger.info(f"LLM返回结果: {content}")
|
||||
# if reasoning_content:
|
||||
# logger.info(f"LLM推理: {reasoning_content}")
|
||||
# else:
|
||||
# logger.info(f"LLM推理: 无")
|
||||
|
||||
if not content:
|
||||
logger.warning("LLM返回空结果")
|
||||
return []
|
||||
|
||||
# 5. 解析结果
|
||||
result = repair_json(content)
|
||||
if isinstance(result, str):
|
||||
result = json.loads(result)
|
||||
|
||||
if not isinstance(result, dict) or "selected_situations" not in result:
|
||||
logger.error("LLM返回格式错误")
|
||||
logger.info(f"LLM返回结果: \n{content}")
|
||||
return []
|
||||
|
||||
selected_indices = result["selected_situations"]
|
||||
|
||||
# 根据索引获取完整的表达方式
|
||||
valid_expressions = []
|
||||
for idx in selected_indices:
|
||||
if isinstance(idx, int) and 1 <= idx <= len(all_expressions):
|
||||
expression = all_expressions[idx - 1] # 索引从1开始
|
||||
valid_expressions.append(expression)
|
||||
|
||||
# 对选中的所有表达方式,一次性更新count数
|
||||
if valid_expressions:
|
||||
self.update_expressions_count_batch(valid_expressions, 0.006)
|
||||
|
||||
# logger.info(f"LLM从{len(all_expressions)}个情境中选择了{len(valid_expressions)}个")
|
||||
return valid_expressions
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LLM处理表达方式选择时出错: {e}")
|
||||
return []
|
||||
|
||||
|
||||
|
||||
# Runs at import time. init_prompt is defined outside this view; presumably it
# registers the prompt template(s) this module uses — TODO confirm.
init_prompt()

# Build the module-level ExpressionSelector singleton.
# NOTE(review): if construction fails the error is only printed and the name
# `expression_selector` is never bound, so later imports of it will fail.
try:
    expression_selector = ExpressionSelector()
except Exception as e:
    print(f"ExpressionSelector初始化失败: {e}")
|
||||
40
src/chat/heart_flow/heartflow.py
Normal file
40
src/chat/heart_flow/heartflow.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import traceback
|
||||
from typing import Any, Optional, Dict
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.heart_flow.sub_heartflow import SubHeartflow
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
|
||||
# Module-level logger, registered under the name "heartflow".
logger = get_logger("heartflow")
|
||||
|
||||
|
||||
class Heartflow:
    """Main heart-flow coordinator: owns one SubHeartflow per chat id."""

    def __init__(self):
        # Registry of live sub-heartflows, keyed by their id.
        self.subheartflows: Dict[Any, "SubHeartflow"] = {}

    async def get_or_create_subheartflow(self, subheartflow_id: Any) -> Optional["SubHeartflow"]:
        """Return the SubHeartflow for ``subheartflow_id``, creating it if needed.

        Args:
            subheartflow_id: Key identifying the sub-heartflow.

        Returns:
            The cached or newly created and initialized instance, or ``None``
            when creation or initialization raises.
        """
        cached = self.subheartflows.get(subheartflow_id)
        if cached:
            return cached

        try:
            created = SubHeartflow(subheartflow_id)
            await created.initialize()

            # Register only after initialization succeeded.
            self.subheartflows[subheartflow_id] = created
            display_name = get_chat_manager().get_stream_name(subheartflow_id) or subheartflow_id
            logger.info(f"[{display_name}] 开始接收消息")
        except Exception as e:
            logger.error(f"创建子心流 {subheartflow_id} 失败: {e}", exc_info=True)
            traceback.print_exc()
            return None
        else:
            return created
|
||||
|
||||
|
||||
# Module-level Heartflow instance, created at import time.
heartflow = Heartflow()
|
||||
173
src/chat/heart_flow/heartflow_message_processor.py
Normal file
173
src/chat/heart_flow/heartflow_message_processor.py
Normal file
@@ -0,0 +1,173 @@
|
||||
import asyncio
|
||||
import re
|
||||
import math
|
||||
import traceback
|
||||
|
||||
from typing import Tuple, TYPE_CHECKING
|
||||
|
||||
from src.config.config import global_config
|
||||
from src.chat.memory_system.Hippocampus import hippocampus_manager
|
||||
from src.chat.message_receive.message import MessageRecv
|
||||
from src.chat.message_receive.storage import MessageStorage
|
||||
from src.chat.heart_flow.heartflow import heartflow
|
||||
from src.chat.utils.utils import is_mentioned_bot_in_message
|
||||
from src.chat.utils.timer_calculator import Timer
|
||||
from src.chat.utils.chat_message_builder import replace_user_references_sync
|
||||
from src.common.logger import get_logger
|
||||
from src.person_info.relationship_manager import get_relationship_manager
|
||||
from src.mood.mood_manager import mood_manager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from src.chat.heart_flow.sub_heartflow import SubHeartflow
|
||||
|
||||
# Module-level logger, registered under the name "chat".
logger = get_logger("chat")
|
||||
|
||||
|
||||
async def _process_relationship(message: MessageRecv) -> None:
    """Register the message sender with the relationship manager if unknown.

    Args:
        message: Received message; its ``message_info`` supplies the platform
            and the sender's user info.
    """
    info = message.message_info
    platform = info.platform
    sender = info.user_info
    user_id = sender.user_id  # type: ignore
    nickname = sender.user_nickname  # type: ignore
    # Fall back to the nickname when the sender has no card name.
    cardname = sender.user_cardname or nickname  # type: ignore

    manager = get_relationship_manager()
    if await manager.is_known_some_one(platform, user_id):
        return

    logger.info(f"首次认识用户: {nickname}")
    await manager.first_knowing_some_one(platform, user_id, nickname, cardname)  # type: ignore
|
||||
|
||||
|
||||
# Piecewise-linear segments of the length -> interest curve:
# (start_len, end_len, interest_at_start, interest_at_end).
# The original comment cites an observed length distribution as the basis:
# 0-5 chars 26.57%, 6-10 27.18%, 11-20 22.76%, 21-30 10.33%, 31+ 13.86%.
_LENGTH_INTEREST_SEGMENTS = (
    (1, 5, 0.01, 0.03),
    (5, 10, 0.03, 0.06),
    (10, 20, 0.06, 0.12),
    (20, 30, 0.12, 0.18),
    (30, 50, 0.18, 0.22),
    (50, 100, 0.22, 0.26),
)


def _length_based_interest(text_len: int) -> float:
    """Map a message length to a base interest value in [0.01, 0.3].

    Empty text gets the minimum 0.01; lengths 1-100 follow the linear
    segments above; longer text grows logarithmically from 0.26 and reaches
    0.3 at 1000 characters, after which the final clamp caps it.
    """
    if text_len == 0:
        base_interest = 0.01
    else:
        for start, end, low, high in _LENGTH_INTEREST_SEGMENTS:
            if text_len <= end:
                base_interest = low + (text_len - start) * (high - low) / (end - start)
                break
        else:
            # 901 == 1000 - 99, so the log term equals 1.0 at 1000 characters.
            base_interest = 0.26 + (0.3 - 0.26) * (math.log10(text_len - 99) / math.log10(901))

    # Keep the result inside the documented range.
    return min(max(base_interest, 0.01), 0.3)


async def _calculate_interest(message: MessageRecv) -> Tuple[float, bool, list[str]]:
    """Compute how interesting a message is.

    The score is the memory activation rate of the text, plus a
    length-based base interest, plus 1 when the bot is mentioned.

    Args:
        message: Message to score.

    Returns:
        Tuple[float, bool, list[str]]: (interest, was the bot mentioned,
        keywords returned by the memory activation lookup).
    """
    is_mentioned, _ = is_mentioned_bot_in_message(message)

    with Timer("记忆激活"):
        interested_rate, keywords = await hippocampus_manager.get_activate_from_text(
            message.processed_plain_text,
            max_depth=5,
            fast_retrieval=False,
        )
        logger.debug(f"记忆激活率: {interested_rate:.2f}, 关键词: {keywords}")

    interested_rate += _length_based_interest(len(message.processed_plain_text))

    if is_mentioned:
        # Fixed bonus applied whenever the bot is mentioned.
        interested_rate += 1

    return interested_rate, is_mentioned, keywords
|
||||
|
||||
|
||||
class HeartFCMessageReceiver:
    """Heart-flow message processor: scores, stores and logs incoming messages."""

    def __init__(self):
        """Create the message storage and the background-task registry."""
        self.storage = MessageStorage()
        # Strong references to fire-and-forget tasks; asyncio itself keeps only
        # weak references, so an unreferenced task may be garbage-collected
        # before it finishes.
        self._background_tasks: set = set()

    async def process_message(self, message: MessageRecv) -> None:
        """Process one received message.

        Steps:
            1. Read sender and chat stream from the message.
            2. Compute the interest value, store the message, make sure the
               chat's sub-heartflow exists and (if enabled) schedule a mood
               update.
            3. Log the message in readable form.
            4. Run relationship handling when enabled.

        Any exception is logged and swallowed so that one bad message does
        not propagate to the caller.

        Args:
            message: The received message object.
        """
        try:
            # 1. Message parsing and initialisation
            userinfo = message.message_info.user_info
            chat = message.chat_stream

            # 2. Interest calculation and update
            interested_rate, is_mentioned, keywords = await _calculate_interest(message)
            message.interest_value = interested_rate
            message.is_mentioned = is_mentioned

            await self.storage.store_message(message, chat)

            # May be None when the sub-heartflow could not be created.
            subheartflow: "SubHeartflow | None" = await heartflow.get_or_create_subheartflow(chat.stream_id)

            if global_config.mood.enable_mood:
                if subheartflow is None:
                    # Skip only the mood update; logging and relationship
                    # handling below must still run.
                    logger.warning(f"[{chat.stream_id}] 子心流创建失败,跳过情绪更新")
                else:
                    chat_mood = mood_manager.get_mood_by_chat_id(subheartflow.chat_id)
                    mood_task = asyncio.create_task(chat_mood.update_mood_by_message(message, interested_rate))
                    self._background_tasks.add(mood_task)
                    mood_task.add_done_callback(self._background_tasks.discard)

            # 3. Logging
            mes_name = chat.group_info.group_name if chat.group_info else "私聊"
            current_talk_frequency = global_config.chat.get_current_talk_frequency(chat.stream_id)

            # Show image markers [picid:...] as [图片] in the log line.
            picid_pattern = r"\[picid:([^\]]+)\]"
            processed_plain_text = re.sub(picid_pattern, "[图片]", message.processed_plain_text)

            # Convert reply/@ references of the form <aaa:bbb> into readable text.
            processed_plain_text = replace_user_references_sync(
                processed_plain_text,
                message.message_info.platform,  # type: ignore
                replace_bot_name=True,
            )

            if keywords:
                logger.info(f"[{mes_name}]{userinfo.user_nickname}:{processed_plain_text}[兴趣度:{interested_rate:.2f}][关键词:{keywords}]")  # type: ignore
            else:
                logger.info(f"[{mes_name}]{userinfo.user_nickname}:{processed_plain_text}[兴趣度:{interested_rate:.2f}]")  # type: ignore

            logger.debug(f"[{mes_name}][当前时段回复频率: {current_talk_frequency}]")

            # 4. Relationship handling
            if global_config.relationship.enable_relationship:
                await _process_relationship(message)

        except Exception as e:
            logger.error(f"消息处理失败: {e}")
            print(traceback.format_exc())
|
||||
41
src/chat/heart_flow/sub_heartflow.py
Normal file
41
src/chat/heart_flow/sub_heartflow.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from rich.traceback import install
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
from src.chat.chat_loop.heartFC_chat import HeartFChatting
|
||||
from src.chat.utils.utils import get_chat_type_and_target_info
|
||||
|
||||
# Module-level logger, registered under the name "sub_heartflow".
logger = get_logger("sub_heartflow")

# Install rich's traceback handler, showing 3 extra lines of code context.
install(extra_lines=3)
|
||||
|
||||
|
||||
class SubHeartflow:
    """Per-chat heart-flow unit that owns one HeartFChatting instance."""

    def __init__(self, subheartflow_id):
        """Set up the sub-heartflow for one chat.

        Args:
            subheartflow_id: Unique identifier of this sub-heartflow; it is
                also used as the chat id.
        """
        # Both attributes intentionally hold the same value.
        self.subheartflow_id = self.chat_id = subheartflow_id

        chat_type_info = get_chat_type_and_target_info(self.chat_id)
        self.is_group_chat, self.chat_target_info = chat_type_info

        stream_name = get_chat_manager().get_stream_name(self.subheartflow_id)
        self.log_prefix = stream_name or self.subheartflow_id

        # Timestamp of the last exit from focus mode (cooldown bookkeeping).
        self.last_focus_exit_time: float = 0

        # The HeartFChatting instance that serves this sub-heartflow.
        self.heart_fc_instance: HeartFChatting = HeartFChatting(chat_id=self.subheartflow_id)

    async def initialize(self):
        """Asynchronous second-phase init: start the HeartFChatting instance."""
        await self.heart_fc_instance.start()
|
||||
674
src/chat/knowledge/LICENSE
Normal file
674
src/chat/knowledge/LICENSE
Normal file
@@ -0,0 +1,674 @@
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
||||
0
src/chat/knowledge/__init__.py
Normal file
0
src/chat/knowledge/__init__.py
Normal file
592
src/chat/knowledge/embedding_store.py
Normal file
592
src/chat/knowledge/embedding_store.py
Normal file
@@ -0,0 +1,592 @@
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
import os
|
||||
import math
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# import tqdm
|
||||
import faiss
|
||||
|
||||
from .utils.hash import get_sha256
|
||||
from .global_logger import logger
|
||||
from rich.traceback import install
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
BarColumn,
|
||||
TimeElapsedColumn,
|
||||
TimeRemainingColumn,
|
||||
TaskProgressColumn,
|
||||
MofNCompleteColumn,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
)
|
||||
from src.chat.utils.utils import get_embedding
|
||||
from src.config.config import global_config
|
||||
|
||||
|
||||
# Install rich's traceback handler (extra_lines=3).
install(extra_lines=3)

# Configuration constants for multi-threaded embedding
DEFAULT_MAX_WORKERS = 10  # default maximum number of threads
DEFAULT_CHUNK_SIZE = 10  # default size of the data chunk handled by each thread
MIN_CHUNK_SIZE = 1  # minimum chunk size
MAX_CHUNK_SIZE = 50  # maximum chunk size
MIN_WORKERS = 1  # minimum number of threads
MAX_WORKERS = 20  # maximum number of threads

# Three directory levels above the directory containing this file.
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
EMBEDDING_DATA_DIR = os.path.join(ROOT_PATH, "data", "embedding")
# Same directory, with backslashes replaced by forward slashes.
EMBEDDING_DATA_DIR_STR = str(EMBEDDING_DATA_DIR).replace("\\", "/")
TOTAL_EMBEDDING_TIMES = 3  # embedding count used for statistics

# Test strings for the embedding model, used to check model consistency;
# taken from the developer group's chat log.
# The embeddings of these strings are expected to be fixed and must not
# change over time.
EMBEDDING_TEST_STRINGS = [
    "阿卡伊真的太好玩了,神秘性感大女同等着你",
    "你怎么知道我arc12.64了",
    "我是蕾缪乐小姐的狗",
    "关注Oct谢谢喵",
    "不是w6我不草",
    "关注千石可乐谢谢喵",
    "来玩CLANNAD,AIR,樱之诗,樱之刻谢谢喵",
    "关注墨梓柒谢谢喵",
    "Ciallo~",
    "来玩巧克甜恋谢谢喵",
    "水印",
    "我也在纠结晚饭,铁锅炒鸡听着就香!",
    "test你妈喵",
]
# JSON file holding the reference embeddings of the test strings.
EMBEDDING_TEST_FILE = os.path.join(ROOT_PATH, "data", "embedding_model_test.json")
# Minimum cosine similarity for a test string to count as consistent.
EMBEDDING_SIM_THRESHOLD = 0.99
|
||||
|
||||
|
||||
def cosine_similarity(a, b):
|
||||
# 计算余弦相似度
|
||||
dot = sum(x * y for x, y in zip(a, b, strict=False))
|
||||
norm_a = math.sqrt(sum(x * x for x in a))
|
||||
norm_b = math.sqrt(sum(x * x for x in b))
|
||||
if norm_a == 0 or norm_b == 0:
|
||||
return 0.0
|
||||
return dot / (norm_a * norm_b)
|
||||
|
||||
|
||||
@dataclass
class EmbeddingStoreItem:
    """An item in the embedding store.

    The three attributes are declared as dataclass fields so that the
    generated ``__repr__`` and ``__eq__`` actually reflect the item's
    contents. Without declared fields, ``@dataclass`` generates an ``__eq__``
    that compares empty tuples (every pair of items compares equal) and a
    ``__repr__`` that prints nothing useful.

    The hand-written ``__init__`` is kept because its parameter names
    (``item_hash``, ``content``) differ from the attribute names; ``dataclass``
    does not replace an ``__init__`` that the class already defines.
    """

    hash: str  # key of the item in the store
    embedding: List[float]  # embedding vector of the text
    str: str  # the original text

    def __init__(self, item_hash: str, embedding: List[float], content: str):
        """Create an item.

        Args:
            item_hash: Key of the item in the store.
            embedding: Embedding vector of ``content``.
            content: The original text.
        """
        self.hash = item_hash
        self.embedding = embedding
        self.str = content

    def to_dict(self) -> dict:
        """Return the item as a dict with keys ``hash``, ``embedding`` and ``str``."""
        return {
            "hash": self.hash,
            "embedding": self.embedding,
            "str": self.str,
        }
|
||||
|
||||
|
||||
class EmbeddingStore:
|
||||
def __init__(self, namespace: str, dir_path: str, max_workers: int = DEFAULT_MAX_WORKERS, chunk_size: int = DEFAULT_CHUNK_SIZE):
    """Set up an empty embedding store for one namespace.

    Args:
        namespace: Name of the store; also the base name of its files.
        dir_path: Directory that holds the store's files.
        max_workers: Requested thread count, clamped to
            ``[MIN_WORKERS, MAX_WORKERS]``.
        chunk_size: Requested per-task chunk size, clamped to
            ``[MIN_CHUNK_SIZE, MAX_CHUNK_SIZE]``.
    """
    self.namespace = namespace
    self.dir = dir_path

    # All three files share the "<dir_path>/<namespace>" prefix.
    file_stem = dir_path + "/" + namespace
    self.embedding_file_path = file_stem + ".parquet"
    self.index_file_path = file_stem + ".index"
    self.idx2hash_file_path = file_stem + "_i2h.json"

    # Validate the threading parameters by clamping them into range.
    capped_workers = min(MAX_WORKERS, max_workers)
    self.max_workers = max(MIN_WORKERS, capped_workers)
    capped_chunk = min(MAX_CHUNK_SIZE, chunk_size)
    self.chunk_size = max(MIN_CHUNK_SIZE, capped_chunk)

    # Log whenever a requested value had to be adjusted.
    workers_adjusted = self.max_workers != max_workers
    if workers_adjusted:
        logger.warning(f"max_workers 已从 {max_workers} 调整为 {self.max_workers} (范围: {MIN_WORKERS}-{MAX_WORKERS})")
    chunk_adjusted = self.chunk_size != chunk_size
    if chunk_adjusted:
        logger.warning(f"chunk_size 已从 {chunk_size} 调整为 {self.chunk_size} (范围: {MIN_CHUNK_SIZE}-{MAX_CHUNK_SIZE})")

    # In-memory state; the index and its mapping start out unset.
    self.store = {}
    self.faiss_index = None
    self.idx2hash = None
|
||||
|
||||
def _get_embedding(self, s: str) -> List[float]:
    """Return the embedding vector of ``s``, bridging to the async API.

    ``get_embedding`` is a coroutine function. When no event loop is running
    in the calling thread it is driven directly with ``asyncio.run``. When a
    loop is already running, ``asyncio.run`` cannot be nested, so the
    coroutine is run on a fresh loop in a worker thread and this call blocks
    until it finishes.

    Only the loop probe sits inside the ``try``. A ``RuntimeError`` raised by
    the embedding call itself therefore propagates as-is instead of being
    mistaken for "no running loop" and retried with a nested ``asyncio.run``.

    Args:
        s: Text to embed.

    Returns:
        The embedding vector, or an empty list when ``get_embedding``
        returns ``None`` (an error is logged in that case).
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop in this thread: run the coroutine directly.
        result = asyncio.run(get_embedding(s))
    else:
        # Inside a running loop: run the coroutine in a separate thread.
        def run_in_thread():
            return asyncio.run(get_embedding(s))

        with ThreadPoolExecutor(max_workers=1) as executor:
            result = executor.submit(run_in_thread).result()

    if result is None:
        logger.error(f"获取嵌入失败: {s}")
        return []
    return result
|
||||
|
||||
def _get_embeddings_batch_threaded(self, strs: List[str], chunk_size: int = 10, max_workers: int = 10, progress_callback=None) -> List[Tuple[str, List[float]]]:
    """Fetch embedding vectors for a batch of strings using a thread pool.

    Args:
        strs: Strings to embed.
        chunk_size: Number of strings handled by each submitted task.
        max_workers: Maximum number of worker threads.
        progress_callback: Optional callable; it is called with ``1`` each
            time one string has been processed, whether or not the embedding
            succeeded.

    Returns:
        A list of ``(original string, embedding vector)`` tuples in the same
        order as ``strs``. A string whose embedding could not be obtained is
        paired with an empty list.
    """
    if not strs:
        return []

    # Split the input into chunks.
    chunks = []
    for i in range(0, len(strs), chunk_size):
        chunk = strs[i:i + chunk_size]
        chunks.append((i, chunk))  # keep the start index so order can be restored

    # Result storage: a dict keyed by input index, to guarantee ordering.
    results = {}

    def process_chunk(chunk_data):
        """Process a single chunk and return ``(index, string, vector)`` tuples."""
        start_idx, chunk_strs = chunk_data
        chunk_results = []

        # Each call builds its own LLMRequest instance (one per chunk task).
        from src.llm_models.utils_model import LLMRequest
        from src.config.config import model_config

        try:
            # Create the LLM instance used only by this task.
            llm = LLMRequest(model_set=model_config.model_task_config.embedding, request_type="embedding")

            for i, s in enumerate(chunk_strs):
                try:
                    # Drive the async API directly from this worker thread.
                    embedding = asyncio.run(llm.get_embedding(s))
                    if embedding and len(embedding) > 0:
                        chunk_results.append((start_idx + i, s, embedding[0]))  # embedding[0] is the actual vector
                    else:
                        logger.error(f"获取嵌入失败: {s}")
                        chunk_results.append((start_idx + i, s, []))

                    # Update progress as soon as each embedding is done.
                    # NOTE(review): this call is inside the try, so if the
                    # callback raises, the except below appends a second,
                    # empty entry for the same index and calls it again.
                    if progress_callback:
                        progress_callback(1)

                except Exception as e:
                    logger.error(f"获取嵌入时发生异常: {s}, 错误: {e}")
                    chunk_results.append((start_idx + i, s, []))

                    # Progress is updated even on failure.
                    if progress_callback:
                        progress_callback(1)

        except Exception as e:
            logger.error(f"创建LLM实例失败: {e}")
            # If creating the LLM instance failed, return empty results.
            for i, s in enumerate(chunk_strs):
                chunk_results.append((start_idx + i, s, []))
                # Progress is updated even on failure.
                if progress_callback:
                    progress_callback(1)

        return chunk_results

    # Process the chunks with a thread pool.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks.
        future_to_chunk = {executor.submit(process_chunk, chunk): chunk for chunk in chunks}

        # Collect results (progress was already updated inside process_chunk).
        for future in as_completed(future_to_chunk):
            try:
                chunk_results = future.result()
                for idx, s, embedding in chunk_results:
                    results[idx] = (s, embedding)
            except Exception as e:
                chunk = future_to_chunk[future]
                logger.error(f"处理数据块时发生异常: {chunk}, 错误: {e}")
                # Record empty results for the failed chunk.
                start_idx, chunk_strs = chunk
                for i, s in enumerate(chunk_strs):
                    results[start_idx + i] = (s, [])

    # Return the results in the original input order.
    ordered_results = []
    for i in range(len(strs)):
        if i in results:
            ordered_results.append(results[i])
        else:
            # Guard against any index that was missed.
            ordered_results.append((strs[i], []))

    return ordered_results
|
||||
|
||||
def get_test_file_path(self) -> str:
    """Return the path of the embedding-model test file (``EMBEDDING_TEST_FILE``)."""
    return EMBEDDING_TEST_FILE
|
||||
|
||||
def save_embedding_test_vectors(self):
    """Embed the test strings and save the vectors to the test file.

    The vectors are fetched with the threaded batch helper; any string whose
    batch result is empty is retried once through ``_get_embedding``. The
    result is written as JSON, keyed by the string's index in
    ``EMBEDDING_TEST_STRINGS`` (as a decimal string).

    A string that still has no embedding after the retry is left out of the
    file rather than stored as an empty vector. An empty reference vector
    can never match a real one (its cosine similarity is 0.0), whereas a
    missing entry makes the consistency check regenerate the file.
    """
    logger.info("开始保存测试字符串的嵌入向量...")

    # Fetch the test strings' embeddings in a multi-threaded batch.
    test_count = len(EMBEDDING_TEST_STRINGS)
    embedding_results = self._get_embeddings_batch_threaded(
        EMBEDDING_TEST_STRINGS,
        chunk_size=min(self.chunk_size, test_count),
        max_workers=min(self.max_workers, test_count),
    )

    # Build the index -> vector mapping.
    test_vectors = {}
    for idx, (s, embedding) in enumerate(embedding_results):
        if not embedding:
            logger.error(f"获取测试字符串嵌入失败: {s}")
            # Fall back to the single-call path.
            embedding = self._get_embedding(s)
        if embedding:
            test_vectors[str(idx)] = embedding

    path = self.get_test_file_path()
    # The data directory may not exist yet on a fresh checkout.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(test_vectors, f, ensure_ascii=False, indent=2)

    logger.info("测试字符串嵌入向量保存完成")
|
||||
|
||||
def load_embedding_test_vectors(self):
|
||||
"""加载本地保存的测试字符串嵌入"""
|
||||
path = self.get_test_file_path()
|
||||
if not os.path.exists(path):
|
||||
return None
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
def check_embedding_model_consistency(self):
    """Check that the current embedding model matches the locally saved one.

    The reference vectors are loaded from the test file. If the file is
    missing, or lacks an entry for any test string, the vectors are
    regenerated with the current model and the check passes. Otherwise every
    test string is embedded again and compared with its stored vector by
    cosine similarity.

    Returns:
        ``True`` when the check passes or the reference file was
        (re)generated; ``False`` when an embedding could not be obtained or
        a similarity falls below ``EMBEDDING_SIM_THRESHOLD``.
    """
    reference_vectors = self.load_embedding_test_vectors()
    if reference_vectors is None:
        logger.warning("未检测到本地嵌入模型测试文件,将保存当前模型的测试嵌入。")
        self.save_embedding_test_vectors()
        return True

    sample_count = len(EMBEDDING_TEST_STRINGS)

    # Integrity check: every test string needs a stored vector.
    if any(reference_vectors.get(str(position)) is None for position in range(sample_count)):
        logger.warning("本地嵌入模型测试文件缺失部分测试字符串,将重新保存。")
        self.save_embedding_test_vectors()
        return True

    logger.info("开始检验嵌入模型一致性...")

    current_results = self._get_embeddings_batch_threaded(
        EMBEDDING_TEST_STRINGS,
        chunk_size=min(self.chunk_size, sample_count),
        max_workers=min(self.max_workers, sample_count),
    )

    for position, (text, current_vector) in enumerate(current_results):
        if not current_vector:
            logger.error(f"获取测试字符串嵌入失败: {text}")
            return False

        similarity = cosine_similarity(reference_vectors.get(str(position)), current_vector)
        if similarity < EMBEDDING_SIM_THRESHOLD:
            logger.error(f"嵌入模型一致性校验失败,字符串: {text}, 相似度: {similarity:.4f}")
            return False

    logger.info("嵌入模型一致性校验通过。")
    return True
|
||||
|
||||
def batch_insert_strs(self, strs: List[str], times: int) -> None:
    """Embed the given strings and add them to the store.

    Strings whose namespaced SHA-256 key is already in ``self.store`` are
    skipped. The rest are embedded through the threaded batch helper while a
    rich progress bar is updated from its callback. Strings whose embedding
    comes back empty are logged and not stored.

    Args:
        strs: Strings to insert; nothing happens when it is empty.
        times: Step number shown in the progress-bar title next to
            ``TOTAL_EMBEDDING_TIMES``.
    """
    if not strs:
        return

    total = len(strs)

    # Keep only the strings that are not stored yet.
    pending = [text for text in strs if self.namespace + "-" + get_sha256(text) not in self.store]

    if not pending:
        logger.info(f"所有字符串已存在于{self.namespace}嵌入库中,跳过处理")
        return

    pending_count = len(pending)
    logger.info(f"需要处理 {pending_count}/{total} 个新字符串")

    progress_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        "•",
        TimeElapsedColumn(),
        "<",
        TimeRemainingColumn(),
    )
    with Progress(*progress_columns, transient=False) as progress:
        task = progress.add_task(f"存入嵌入库:({times}/{TOTAL_EMBEDDING_TIMES})", total=total)

        # Items that already existed count as done right away.
        skipped = total - pending_count
        if skipped > 0:
            progress.update(task, advance=skipped)

        # Derive chunk size and worker count from the instance settings and
        # the amount of pending work.
        if self.max_workers > 0:
            per_worker = pending_count // self.max_workers
        else:
            per_worker = self.chunk_size
        chunk_size = max(MIN_CHUNK_SIZE, min(self.chunk_size, per_worker))

        chunk_count = pending_count // chunk_size if chunk_size > 0 else 1
        worker_count = min(self.max_workers, max(MIN_WORKERS, chunk_count))

        logger.debug(f"使用多线程处理: chunk_size={chunk_size}, max_workers={worker_count}")

        def update_progress(count):
            # Progress callback invoked by the batch helper.
            progress.update(task, advance=count)

        embedded = self._get_embeddings_batch_threaded(
            pending,
            chunk_size=chunk_size,
            max_workers=worker_count,
            progress_callback=update_progress,
        )

        # Store the results; progress was already advanced by the callback.
        for text, vector in embedded:
            if not vector:
                logger.warning(f"跳过存储失败的嵌入: {text[:50]}...")
                continue
            item_hash = self.namespace + "-" + get_sha256(text)
            self.store[item_hash] = EmbeddingStoreItem(item_hash, vector, text)
|
||||
|
||||
def save_to_file(self) -> None:
    """Persist the store, and the Faiss index when present, to disk.

    All items are written as one parquet file at ``self.embedding_file_path``.
    When both ``self.faiss_index`` and ``self.idx2hash`` are set, the index is
    written to ``self.index_file_path`` and the mapping is dumped as JSON to
    ``self.idx2hash_file_path``.
    """
    logger.info(f"正在保存{self.namespace}嵌入库到文件{self.embedding_file_path}")
    records = [item.to_dict() for item in self.store.values()]
    data_frame = pd.DataFrame(records)

    # Make sure the target directory and an (initially empty) file exist.
    if not os.path.exists(self.dir):
        os.makedirs(self.dir, exist_ok=True)
    if not os.path.exists(self.embedding_file_path):
        with open(self.embedding_file_path, "w"):
            pass

    data_frame.to_parquet(self.embedding_file_path, engine="pyarrow", index=False)
    logger.info(f"{self.namespace}嵌入库保存成功")

    if self.faiss_index is None or self.idx2hash is None:
        return

    logger.info(f"正在保存{self.namespace}嵌入库的FaissIndex到文件{self.index_file_path}")
    faiss.write_index(self.faiss_index, self.index_file_path)
    logger.info(f"{self.namespace}嵌入库的FaissIndex保存成功")

    logger.info(f"正在保存{self.namespace}嵌入库的idx2hash映射到文件{self.idx2hash_file_path}")
    mapping_json = json.dumps(self.idx2hash, ensure_ascii=False, indent=4)
    with open(self.idx2hash_file_path, "w", encoding="utf-8") as out_file:
        out_file.write(mapping_json)
    logger.info(f"{self.namespace}嵌入库的idx2hash映射保存成功")
|
||||
|
||||
def load_from_file(self) -> None:
    """Load the store, the Faiss index and the idx2hash mapping from disk.

    The parquet file at ``self.embedding_file_path`` is read row by row into
    ``self.store``. Then the Faiss index and the idx2hash JSON are loaded. If
    either file is missing or fails to load, the index is rebuilt from the
    store and everything is saved again.

    Raises:
        Exception: If the parquet file does not exist.
    """
    if not os.path.exists(self.embedding_file_path):
        raise Exception(f"文件{self.embedding_file_path}不存在")
    logger.info("正在加载嵌入库...")
    logger.debug(f"正在从文件{self.embedding_file_path}中加载{self.namespace}嵌入库")
    data_frame = pd.read_parquet(self.embedding_file_path, engine="pyarrow")
    total = len(data_frame)
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        "•",
        TimeElapsedColumn(),
        "<",
        TimeRemainingColumn(),
        transient=False,
    ) as progress:
        task = progress.add_task("加载嵌入库", total=total)
        for _, row in data_frame.iterrows():
            self.store[row["hash"]] = EmbeddingStoreItem(row["hash"], row["embedding"], row["str"])
            progress.update(task, advance=1)
    logger.info(f"{self.namespace}嵌入库加载成功")

    try:
        if not os.path.exists(self.index_file_path):
            raise Exception(f"文件{self.index_file_path}不存在")
        logger.info(f"正在加载{self.namespace}嵌入库的FaissIndex...")
        logger.debug(f"正在从文件{self.index_file_path}中加载{self.namespace}嵌入库的FaissIndex")
        self.faiss_index = faiss.read_index(self.index_file_path)
        logger.info(f"{self.namespace}嵌入库的FaissIndex加载成功")

        if not os.path.exists(self.idx2hash_file_path):
            raise Exception(f"文件{self.idx2hash_file_path}不存在")
        logger.info(f"正在加载{self.namespace}嵌入库的idx2hash映射...")
        logger.debug(f"正在从文件{self.idx2hash_file_path}中加载{self.namespace}嵌入库的idx2hash映射")
        # save_to_file writes this file as UTF-8, so read it the same way
        # instead of relying on the platform default encoding.
        with open(self.idx2hash_file_path, "r", encoding="utf-8") as f:
            self.idx2hash = json.load(f)
        logger.info(f"{self.namespace}嵌入库的idx2hash映射加载成功")
    except Exception as e:
        # Any problem with the index files is recoverable: rebuild from the
        # embeddings that were just loaded and persist the result.
        logger.error(f"加载{self.namespace}嵌入库的FaissIndex时发生错误:{e}")
        logger.warning("正在重建Faiss索引")
        self.build_faiss_index()
        logger.info(f"{self.namespace}嵌入库的FaissIndex重建成功")
        self.save_to_file()
|
||||
|
||||
def build_faiss_index(self) -> None:
    """Rebuild the Faiss index from the store, using cosine similarity.

    Vectors are L2-normalised and added to an inner-product index, so the
    inner product of two entries equals their cosine similarity.
    ``self.idx2hash`` is rebuilt to map each row position (as a decimal
    string) to the corresponding store key.

    An empty store yields an empty index and an empty mapping.
    """
    vectors = []
    self.idx2hash = {}
    for position, (item_hash, item) in enumerate(self.store.items()):
        vectors.append(item.embedding)
        self.idx2hash[str(position)] = item_hash

    self.faiss_index = faiss.IndexFlatIP(global_config.lpmm_knowledge.embedding_dimension)
    if not vectors:
        # np.array([]) is 1-D, which normalize_L2/add cannot handle, so an
        # empty store must stop here with the empty index.
        return

    embeddings = np.array(vectors, dtype=np.float32)
    # L2-normalise in place, then index the normalised rows.
    faiss.normalize_L2(embeddings)
    self.faiss_index.add(embeddings)
|
||||
|
||||
def search_top_k(self, query: List[float], k: int) -> List[Tuple[str, float]]:
    """Return the k stored items most similar to ``query`` by cosine similarity.

    Args:
        query: Embedding of the query.
        k: Number of nearest items to request from the index.

    Returns:
        A list of ``(hash, similarity)`` pairs. The list is empty when the
        index or the idx2hash mapping has not been built, and may hold fewer
        than ``k`` entries when the index returns positions outside the
        mapping.
    """
    if self.faiss_index is None:
        logger.debug("FaissIndex尚未构建,返回None")
        return []
    if self.idx2hash is None:
        logger.warning("idx2hash尚未构建,返回None")
        return []

    # normalize_L2 works in place, so the very array that was normalised
    # must be the one handed to search(). Normalising a temporary and then
    # searching the raw query returns scaled inner products, not cosines.
    query_matrix = np.array([query], dtype=np.float32)
    faiss.normalize_L2(query_matrix)

    distances, indices = self.faiss_index.search(query_matrix, k)

    known_count = len(self.idx2hash)
    return [
        (self.idx2hash[str(int(idx))], float(sim))
        for idx, sim in zip(indices.flatten(), distances.flatten(), strict=False)
        # Drop positions that are not covered by the mapping.
        if 0 <= idx < known_count
    ]
|
||||
|
||||
|
||||
class EmbeddingManager:
    """Bundle the paragraph, entity and relation embedding stores."""

    def __init__(self, max_workers: int = DEFAULT_MAX_WORKERS, chunk_size: int = DEFAULT_CHUNK_SIZE):
        """Create the three embedding stores with shared threading settings.

        Args:
            max_workers: Maximum number of worker threads.
            chunk_size: Size of the data chunk handled by each thread.
        """
        store_options = {"max_workers": max_workers, "chunk_size": chunk_size}
        self.paragraphs_embedding_store = EmbeddingStore(
            "paragraph",  # type: ignore
            EMBEDDING_DATA_DIR_STR,
            **store_options,
        )
        self.entities_embedding_store = EmbeddingStore(
            "entity",  # type: ignore
            EMBEDDING_DATA_DIR_STR,
            **store_options,
        )
        self.relation_embedding_store = EmbeddingStore(
            "relation",  # type: ignore
            EMBEDDING_DATA_DIR_STR,
            **store_options,
        )
        self.stored_pg_hashes = set()

    def check_all_embedding_model_consistency(self):
        """Run the embedding-model consistency check.

        The check is delegated to the paragraph store only.
        """
        return self.paragraphs_embedding_store.check_embedding_model_consistency()

    def _store_pg_into_embedding(self, raw_paragraphs: Dict[str, str]):
        """Embed the paragraph texts and insert them into the paragraph store."""
        paragraph_texts = list(raw_paragraphs.values())
        self.paragraphs_embedding_store.batch_insert_strs(paragraph_texts, times=1)

    def _store_ent_into_embedding(self, triple_list_data: Dict[str, List[List[str]]]):
        """Embed every distinct triple subject/object into the entity store."""
        entities = {
            triple[position]
            for triple_list in triple_list_data.values()
            for triple in triple_list
            for position in (0, 2)
        }
        self.entities_embedding_store.batch_insert_strs(list(entities), times=2)

    def _store_rel_into_embedding(self, triple_list_data: Dict[str, List[List[str]]]):
        """Embed every distinct triple, rendered with ``str(tuple)``, into the relation store."""
        # Unique relation triples (as tuples) across all chunks.
        unique_triples = list(
            {tuple(triple) for triples in triple_list_data.values() for triple in triples}
        )
        relation_texts = [str(triple) for triple in unique_triples]
        self.relation_embedding_store.batch_insert_strs(relation_texts, times=3)

    def load_from_file(self):
        """Load all three stores from disk and refresh ``stored_pg_hashes``."""
        for store in (
            self.paragraphs_embedding_store,
            self.entities_embedding_store,
            self.relation_embedding_store,
        ):
            store.load_from_file()
        # The stored hashes are the keys of the paragraph store.
        self.stored_pg_hashes = set(self.paragraphs_embedding_store.store.keys())

    def store_new_data_set(
        self,
        raw_paragraphs: Dict[str, str],
        triple_list_data: Dict[str, List[List[str]]],
    ):
        """Store a new data set in the three embedding stores.

        Args:
            raw_paragraphs: Paragraph texts; the keys are added to
                ``stored_pg_hashes`` afterwards.
            triple_list_data: Lists of ``[subject, predicate, object]``
                triples.

        Raises:
            Exception: If the embedding-model consistency check fails.
        """
        if not self.check_all_embedding_model_consistency():
            raise Exception("嵌入模型与本地存储不一致,请检查模型设置或清空嵌入库后重试。")
        self._store_pg_into_embedding(raw_paragraphs)
        self._store_ent_into_embedding(triple_list_data)
        self._store_rel_into_embedding(triple_list_data)
        # NOTE(review): load_from_file fills this set with store keys, while
        # the keys added here come straight from raw_paragraphs -- confirm
        # both use the same key format.
        self.stored_pg_hashes.update(raw_paragraphs.keys())

    def save_to_file(self):
        """Save all three stores to disk."""
        for store in (
            self.paragraphs_embedding_store,
            self.entities_embedding_store,
            self.relation_embedding_store,
        ):
            store.save_to_file()

    def rebuild_faiss_index(self):
        """Rebuild the Faiss index of every store (call after adding new data)."""
        for store in (
            self.paragraphs_embedding_store,
            self.entities_embedding_store,
            self.relation_embedding_store,
        ):
            store.build_faiss_index()
|
||||
5
src/chat/knowledge/global_logger.py
Normal file
5
src/chat/knowledge/global_logger.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# Configure the logger that the knowledge modules import from this file.

from src.common.logger import get_logger

# Logger obtained under the name "lpmm".
logger = get_logger("lpmm")
|
||||
175
src/chat/knowledge/ie_process.py
Normal file
175
src/chat/knowledge/ie_process.py
Normal file
@@ -0,0 +1,175 @@
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from typing import List, Union
|
||||
|
||||
from .global_logger import logger
|
||||
from . import prompt_template
|
||||
from .knowledge_lib import INVALID_ENTITY
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from json_repair import repair_json
|
||||
|
||||
|
||||
def _extract_json_from_text(text: str):
    # sourcery skip: assign-if-exp, extract-method
    """Fault-tolerant extraction of JSON data from text.

    The text is passed through ``repair_json`` and, if that yields a string,
    parsed with ``json.loads``.

    Returns:
        * the parsed list when the JSON is a list;
        * the wrapped list when the JSON is a one-key dict whose value is a
          list;
        * the dict itself for any other dict;
        * ``[]`` when ``text`` is ``None``, the JSON has another type, or
          any exception is raised while repairing/parsing.
    """
    if text is None:
        logger.error("输入文本为None")
        return []

    try:
        repaired = repair_json(text)
        if isinstance(repaired, str):
            payload = json.loads(repaired)
        else:
            payload = repaired

        # A list is returned unchanged.
        if isinstance(payload, list):
            return payload

        if isinstance(payload, dict):
            # A one-key dict may just be a wrapper around the list.
            if len(payload) == 1:
                only_value = next(iter(payload.values()))
                if isinstance(only_value, list):
                    return only_value
            return payload

        # Anything else is not a supported shape.
        logger.warning(f"解析的JSON不是预期格式: {type(payload)}, 内容: {payload}")
        return []

    except Exception as e:
        logger.error(f"JSON提取失败: {e}, 原始文本: {text[:100] if text else 'None'}...")
        return []
|
||||
|
||||
|
||||
def _entity_extract(llm_req: LLMRequest, paragraph: str) -> List[str]:
    # sourcery skip: reintroduce-else, swap-if-else-branches, use-named-expression
    """Extract entities from a paragraph with the LLM.

    Args:
        llm_req: LLM request object; its ``generate_response_async`` is
            awaited with the entity-extraction prompt.
        paragraph: Text to extract entities from.

    Returns:
        The non-empty list of extracted entities, with ``None``, empty
        strings and members of ``INVALID_ENTITY`` removed.

    Raises:
        ValueError: If the response is neither a list nor a dict holding a
            list under one of the known keys, or if no valid entity remains.
    """
    from concurrent.futures import ThreadPoolExecutor

    entity_extract_context = prompt_template.build_entity_extract_context(paragraph)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread: run the coroutine directly.
        response, _ = asyncio.run(llm_req.generate_response_async(entity_extract_context))
    else:
        # A loop is running in *this* thread. Submitting the coroutine to it
        # and blocking on the result would stall the loop that has to run
        # it (deadlock), so the coroutine runs on a fresh loop in a helper
        # thread instead, just like the no-loop branch does.
        with ThreadPoolExecutor(max_workers=1) as executor:
            response, _ = executor.submit(
                asyncio.run, llm_req.generate_response_async(entity_extract_context)
            ).result()

    logger.debug(f"LLM返回的原始响应: {response}")

    entity_extract_result = _extract_json_from_text(response)

    # Accept a bare list, or a dict wrapping the list under a common key.
    if not isinstance(entity_extract_result, list):
        if not isinstance(entity_extract_result, dict):
            raise ValueError(f"实体提取结果格式错误,期望列表但得到: {type(entity_extract_result)}")

        for key in ["entities", "result", "data", "items"]:
            if key in entity_extract_result and isinstance(entity_extract_result[key], list):
                entity_extract_result = entity_extract_result[key]
                break
        else:
            # No suitable list was found in the dict.
            raise ValueError(f"实体提取结果格式错误,期望列表但得到: {type(entity_extract_result)}")

    # Drop invalid entities.
    entity_extract_result = [
        entity
        for entity in entity_extract_result
        if (entity is not None) and (entity != "") and (entity not in INVALID_ENTITY)
    ]

    if not entity_extract_result:
        raise ValueError("实体提取结果为空")

    return entity_extract_result
|
||||
|
||||
|
||||
def _rdf_triple_extract(llm_req: LLMRequest, paragraph: str, entities: list) -> List[List[str]]:
    """Extract RDF triples from a paragraph with the LLM.

    Args:
        llm_req: LLM request object; its ``generate_response_async`` is
            awaited with the triple-extraction prompt.
        paragraph: Text to extract triples from.
        entities: Entities already extracted from the paragraph; they are
            JSON-encoded into the prompt.

    Returns:
        The list of triples, each a 3-element list without ``None`` or
        empty-string members.

    Raises:
        ValueError: If the response is neither a list nor a dict holding a
            list under one of the known keys, or if any triple is malformed.
    """
    from concurrent.futures import ThreadPoolExecutor

    rdf_extract_context = prompt_template.build_rdf_triple_extract_context(
        paragraph, entities=json.dumps(entities, ensure_ascii=False)
    )

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread: run the coroutine directly.
        response, _ = asyncio.run(llm_req.generate_response_async(rdf_extract_context))
    else:
        # A loop is running in *this* thread. Submitting the coroutine to it
        # and blocking on the result would stall the loop that has to run
        # it (deadlock), so the coroutine runs on a fresh loop in a helper
        # thread instead, just like the no-loop branch does.
        with ThreadPoolExecutor(max_workers=1) as executor:
            response, _ = executor.submit(
                asyncio.run, llm_req.generate_response_async(rdf_extract_context)
            ).result()

    logger.debug(f"RDF LLM返回的原始响应: {response}")

    rdf_triple_result = _extract_json_from_text(response)

    # Accept a bare list, or a dict wrapping the list under a common key.
    if not isinstance(rdf_triple_result, list):
        if not isinstance(rdf_triple_result, dict):
            raise ValueError(f"RDF三元组提取结果格式错误,期望列表但得到: {type(rdf_triple_result)}")

        for key in ["triples", "result", "data", "items"]:
            if key in rdf_triple_result and isinstance(rdf_triple_result[key], list):
                rdf_triple_result = rdf_triple_result[key]
                break
        else:
            # No suitable list was found in the dict.
            raise ValueError(f"RDF三元组提取结果格式错误,期望列表但得到: {type(rdf_triple_result)}")

    # Every triple must be a list of exactly three non-empty members.
    for triple in rdf_triple_result:
        if (
            not isinstance(triple, list)
            or len(triple) != 3
            or (triple[0] is None or triple[1] is None or triple[2] is None)
            or "" in triple
        ):
            raise ValueError("RDF提取结果格式错误")

    return rdf_triple_result
|
||||
|
||||
|
||||
def _extract_with_retry(extract, task_name: str, max_attempts: int = 3, retry_delay: int = 5):
    """Call ``extract()`` until it succeeds or ``max_attempts`` calls have failed.

    Returns ``(True, result)`` on success and ``(False, None)`` once every
    attempt has raised. ``task_name`` is used as the prefix of the log messages.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return True, extract()
        except Exception as e:
            logger.warning(f"{task_name}失败,错误信息:{e}")
            if attempt < max_attempts:
                logger.warning(f"将于{retry_delay}秒后重试")
                time.sleep(retry_delay)
    logger.error(f"{task_name}失败,已达最大重试次数")
    return False, None


def info_extract_from_str(
    llm_client_for_ner: LLMRequest, llm_client_for_rdf: LLMRequest, paragraph: str
) -> Union[tuple[None, None], tuple[list[str], list[list[str]]]]:
    """Extract entities and RDF triples from a paragraph, with retries.

    Entity extraction runs first; its result feeds the triple extraction.
    Each stage is attempted up to three times, sleeping 5 seconds between
    attempts.

    Args:
        llm_client_for_ner: LLM request object used for entity extraction.
        llm_client_for_rdf: LLM request object used for triple extraction.
        paragraph: Text to process.

    Returns:
        ``(entities, triples)`` on success, or ``(None, None)`` when either
        stage fails on all attempts.
    """
    succeeded, entity_extract_result = _extract_with_retry(
        lambda: _entity_extract(llm_client_for_ner, paragraph), "实体提取"
    )
    if not succeeded:
        return None, None

    # Failures of this stage are logged as RDF-triple failures, not as
    # entity-extraction failures.
    succeeded, rdf_triple_extract_result = _extract_with_retry(
        lambda: _rdf_triple_extract(llm_client_for_rdf, paragraph, entity_extract_result),
        "RDF三元组提取",
    )
    if not succeeded:
        return None, None

    return entity_extract_result, rdf_triple_extract_result
|
||||
438
src/chat/knowledge/kg_manager.py
Normal file
438
src/chat/knowledge/kg_manager.py
Normal file
@@ -0,0 +1,438 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from rich.progress import (
|
||||
Progress,
|
||||
BarColumn,
|
||||
TimeElapsedColumn,
|
||||
TimeRemainingColumn,
|
||||
TaskProgressColumn,
|
||||
MofNCompleteColumn,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
)
|
||||
from quick_algo import di_graph, pagerank
|
||||
|
||||
|
||||
from .utils.hash import get_sha256
|
||||
from .embedding_store import EmbeddingManager, EmbeddingStoreItem
|
||||
from src.config.config import global_config
|
||||
|
||||
from .global_logger import logger
|
||||
|
||||
|
||||
def _get_kg_dir():
|
||||
"""
|
||||
安全地获取KG数据目录路径
|
||||
"""
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
root_path: str = os.path.abspath(os.path.join(current_dir, "..", "..", ".."))
|
||||
kg_dir = os.path.join(root_path, "data/rag")
|
||||
|
||||
return str(kg_dir).replace("\\", "/")
|
||||
|
||||
|
||||
# Lazy initialisation: avoids touching a possibly uninitialised local_storage
# while the module is being loaded.
def get_kg_dir_str():
    """Return the KG directory as a string by delegating to ``_get_kg_dir``."""
    kg_dir = _get_kg_dir()
    return kg_dir
|
||||
|
||||
|
||||
class KGManager:
    """Knowledge graph over entity and paragraph nodes with file persistence.

    Holds the directed graph, the per-entity appearance counts and the set of
    paragraph hashes already merged into the graph.
    """

    def __init__(self):
        # --- Persisted fields ---
        # Hashes of stored paragraphs, used for de-duplication.
        self.stored_paragraph_hashes = set()
        # Appearance count per entity hash key.
        self.ent_appear_cnt = {}
        # The knowledge graph itself.
        self.graph = di_graph.DiGraph()

        # --- Persistence paths (resolved when the instance is created) ---
        self.dir_path = get_kg_dir_str()
        self.graph_data_path = self.dir_path + "/" + "rag-graph" + ".graphml"
        self.ent_cnt_data_path = self.dir_path + "/" + "rag-ent-cnt" + ".parquet"
        self.pg_hash_file_path = self.dir_path + "/" + "rag-pg-hash" + ".json"

    def save_to_file(self):
        """Save the graph, the entity counts and the paragraph hashes to disk."""
        # Make sure the directory exists.
        if not os.path.exists(self.dir_path):
            os.makedirs(self.dir_path, exist_ok=True)

        # Graph.
        di_graph.save_to_file(self.graph, self.graph_data_path)

        # Entity appearance counts.
        ent_cnt_df = pd.DataFrame([{"hash_key": k, "appear_cnt": v} for k, v in self.ent_appear_cnt.items()])
        ent_cnt_df.to_parquet(self.ent_cnt_data_path, engine="pyarrow", index=False)

        # Paragraph hashes.
        with open(self.pg_hash_file_path, "w", encoding="utf-8") as f:
            data = {"stored_paragraph_hashes": list(self.stored_paragraph_hashes)}
            f.write(json.dumps(data, ensure_ascii=False, indent=4))

    def load_from_file(self):
        """Load the graph, the entity counts and the paragraph hashes from disk.

        Raises:
            FileNotFoundError: If any of the three data files is missing.
        """
        if not os.path.exists(self.pg_hash_file_path):
            raise FileNotFoundError(f"KG段落hash文件{self.pg_hash_file_path}不存在")
        if not os.path.exists(self.ent_cnt_data_path):
            raise FileNotFoundError(f"KG实体计数文件{self.ent_cnt_data_path}不存在")
        if not os.path.exists(self.graph_data_path):
            raise FileNotFoundError(f"KG图文件{self.graph_data_path}不存在")

        # Paragraph hashes.
        with open(self.pg_hash_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
            self.stored_paragraph_hashes = set(data["stored_paragraph_hashes"])

        # Entity appearance counts.
        ent_cnt_df = pd.read_parquet(self.ent_cnt_data_path, engine="pyarrow")
        self.ent_appear_cnt = {row["hash_key"]: row["appear_cnt"] for _, row in ent_cnt_df.iterrows()}

        # Graph.
        self.graph = di_graph.load_from_file(self.graph_data_path)

    def _build_edges_between_ent(
        self,
        node_to_node: Dict[Tuple[str, str], float],
        triple_list_data: Dict[str, List[List[str]]],
    ):
        """Add entity-to-entity edges and count entity appearances.

        Each triple whose subject differs from its object adds 1.0 to the
        weight of both directed edges between the two entities. Every entity
        seen in such a triple is counted once per triple list.
        """
        for triple_list in triple_list_data.values():
            entity_set = set()
            for triple in triple_list:
                if triple[0] == triple[2]:
                    # Avoid self-loops.
                    continue
                # One triple is one edge, recorded in both directions.
                hash_key1 = "entity" + "-" + get_sha256(triple[0])
                hash_key2 = "entity" + "-" + get_sha256(triple[2])
                node_to_node[(hash_key1, hash_key2)] = node_to_node.get((hash_key1, hash_key2), 0) + 1.0
                node_to_node[(hash_key2, hash_key1)] = node_to_node.get((hash_key2, hash_key1), 0) + 1.0
                entity_set.add(hash_key1)
                entity_set.add(hash_key2)

            # Appearance statistics for this triple list.
            for hash_key in entity_set:
                self.ent_appear_cnt[hash_key] = self.ent_appear_cnt.get(hash_key, 0) + 1.0

    @staticmethod
    def _build_edges_between_ent_pg(
        node_to_node: Dict[Tuple[str, str], float],
        triple_list_data: Dict[str, List[List[str]]],
    ):
        """Add an edge from each triple's subject entity to its paragraph node."""
        for idx in triple_list_data:
            for triple in triple_list_data[idx]:
                ent_hash_key = "entity" + "-" + get_sha256(triple[0])
                pg_hash_key = "paragraph" + "-" + str(idx)
                node_to_node[(ent_hash_key, pg_hash_key)] = node_to_node.get((ent_hash_key, pg_hash_key), 0) + 1.0

    @staticmethod
    def _synonym_connect(
        node_to_node: Dict[Tuple[str, str], float],
        triple_list_data: Dict[str, List[List[str]]],
        embedding_manager: EmbeddingManager,
    ) -> int:
        """Connect entities to their nearest neighbours in embedding space.

        For every entity of the triples, the entity store is searched for
        similar entities. Each hit at or above the configured similarity
        threshold gets an edge in both directions, weighted by the similarity.

        Returns:
            The number of synonym pairs that were connected.
        """
        new_edge_cnt = 0
        # Hash keys of all entity nodes.
        ent_hash_list = set()
        for triple_list in triple_list_data.values():
            for triple in triple_list:
                ent_hash_list.add("entity" + "-" + get_sha256(triple[0]))
                ent_hash_list.add("entity" + "-" + get_sha256(triple[2]))
        ent_hash_list = list(ent_hash_list)

        synonym_hash_set = set()
        synonym_result = {}

        # rich progress bar
        total = len(ent_hash_list)
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            "•",
            TimeElapsedColumn(),
            "<",
            TimeRemainingColumn(),
            transient=False,
        ) as progress:
            task = progress.add_task("同义词连接", total=total)
            for ent_hash in ent_hash_list:
                if ent_hash in synonym_hash_set:
                    # Already linked as somebody else's synonym.
                    progress.update(task, advance=1)
                    continue
                ent = embedding_manager.entities_embedding_store.store.get(ent_hash)
                if ent is None:
                    progress.update(task, advance=1)
                    continue
                assert isinstance(ent, EmbeddingStoreItem)
                # Look up similar entities.
                similar_ents = embedding_manager.entities_embedding_store.search_top_k(
                    ent.embedding, global_config.lpmm_knowledge.rag_synonym_search_top_k
                )
                res_ent = []  # Debug
                for res_ent_hash, similarity in similar_ents:
                    if res_ent_hash == ent_hash:
                        # Avoid self-loops.
                        continue
                    if similarity < global_config.lpmm_knowledge.rag_synonym_threshold:
                        # Below the similarity threshold.
                        continue
                    node_to_node[(res_ent_hash, ent_hash)] = similarity
                    node_to_node[(ent_hash, res_ent_hash)] = similarity
                    synonym_hash_set.add(res_ent_hash)
                    new_edge_cnt += 1
                    res_ent.append(
                        (
                            embedding_manager.entities_embedding_store.store[res_ent_hash].str,
                            similarity,
                        )
                    )  # Debug
                synonym_result[ent.str] = res_ent
                progress.update(task, advance=1)

        for k, v in synonym_result.items():
            print(f'"{k}"的相似实体为:{v}')
        return new_edge_cnt

    def _update_graph(
        self,
        node_to_node: Dict[Tuple[str, str], float],
        embedding_manager: EmbeddingManager,
    ):
        """Merge the collected edges into the graph.

        Steps:
            1. For every pending edge: add it when it is new, otherwise add
               its weight to the existing edge and refresh ``update_time``.
            2. Fill in the attributes of nodes that did not exist before.
        """
        # Sets give O(1) membership tests; the lists returned by the graph
        # would be scanned once per pending edge/node.
        existed_nodes = set(self.graph.get_node_list())
        existed_edges = {str((edge[0], edge[1])) for edge in self.graph.get_edge_list()}

        now_time = time.time()

        # Step 1: edges.
        for src_tgt, weight in node_to_node.items():
            key = str(src_tgt)
            if key not in existed_edges:
                # New edge.
                self.graph.add_edge(
                    di_graph.DiEdge(
                        src_tgt[0],
                        src_tgt[1],
                        {
                            "weight": weight,
                            "create_time": now_time,
                            "update_time": now_time,
                        },
                    )
                )
            else:
                # Existing edge.
                edge_item = self.graph[src_tgt[0], src_tgt[1]]
                edge_item["weight"] += weight
                edge_item["update_time"] = now_time
                self.graph.update_edge(edge_item)

        # Step 2: attributes of new nodes.
        for src_tgt in node_to_node.keys():
            for node_hash in src_tgt:
                if node_hash not in existed_nodes:
                    if node_hash.startswith("entity"):
                        # New entity node.
                        node = embedding_manager.entities_embedding_store.store.get(node_hash)
                        if node is None:
                            logger.warning(f"实体节点 {node_hash} 在嵌入库中不存在,跳过")
                            continue
                        assert isinstance(node, EmbeddingStoreItem)
                        node_item = self.graph[node_hash]
                        node_item["content"] = node.str
                        node_item["type"] = "ent"
                        node_item["create_time"] = now_time
                        self.graph.update_node(node_item)
                    elif node_hash.startswith("paragraph"):
                        # New paragraph node.
                        node = embedding_manager.paragraphs_embedding_store.store.get(node_hash)
                        if node is None:
                            logger.warning(f"段落节点 {node_hash} 在嵌入库中不存在,跳过")
                            continue
                        assert isinstance(node, EmbeddingStoreItem)
                        content = node.str.replace("\n", " ")
                        node_item = self.graph[node_hash]
                        # Only a short preview of the paragraph is kept on the node.
                        node_item["content"] = content if len(content) < 8 else content[:8] + "..."
                        node_item["type"] = "pg"
                        node_item["create_time"] = now_time
                        self.graph.update_node(node_item)

    def build_kg(
        self,
        triple_list_data: Dict[str, List[List[str]]],
        embedding_manager: EmbeddingManager,
    ):
        """Incrementally build the KG.

        Note: the KG should be saved after calling this method.

        Args:
            triple_list_data: Triple data; the keys are recorded in
                ``stored_paragraph_hashes``.
            embedding_manager: EmbeddingManager object.
        """
        # Pending edges: (source, target) -> weight.
        node_to_node = dict()

        # Entity-to-entity relations, plus entity appearance counts.
        logger.info("正在构建KG实体节点之间的关系,同时统计实体出现次数")
        self._build_edges_between_ent(node_to_node, triple_list_data)

        # Entity-to-paragraph relations.
        logger.info("正在构建KG实体节点与文段节点之间的关系")
        self._build_edges_between_ent_pg(node_to_node, triple_list_data)

        # Synonym expansion: link each entity to its most similar entities.
        logger.info("正在进行近义词扩展链接")
        self._synonym_connect(node_to_node, triple_list_data, embedding_manager)

        # Merge everything into the graph.
        self._update_graph(node_to_node, embedding_manager)

        # Record the processed (stored) paragraph hashes.
        for idx in triple_list_data:
            self.stored_paragraph_hashes.add(str(idx))

    def kg_search(
        self,
        relation_search_result: List[Tuple[Tuple[str, str, str], float]],
        paragraph_search_result: List[Tuple[str, float]],
        embed_manager: EmbeddingManager,
    ):
        """RAG search with Personalized PageRank.

        Args:
            relation_search_result: Search results of the relation embedding
                store. NOTE(review): the loop below unpacks three values per
                item (hash, similarity, one ignored value), which does not
                match the annotated 2-tuple -- confirm against the caller.
            paragraph_search_result: Search results of the paragraph embedding
                store as ``(paragraph_hash, similarity)``.
            embed_manager: EmbeddingManager object.

        Returns:
            ``(passage_node_res, ppr_node_weights)``: the paragraph nodes with
            their PageRank scores, sorted from high to low, and the
            personalization weights that were fed to PageRank.
        """
        # All nodes of the graph, as a set for O(1) membership tests.
        existed_nodes = set(self.graph.get_node_list())

        # Personalization weights: entities and paragraphs.
        ent_weights = {}
        pg_weights = {}

        # ---- Entity weights ----

        # For each relation, take subject and object as entities and collect
        # the relation's similarity as the basis of their weight.
        ent_sim_scores = {}
        for relation_hash, similarity, _ in relation_search_result:
            relation = embed_manager.relation_embedding_store.store.get(relation_hash).str
            assert relation is not None
            # Relations are stored as str(tuple); strip "('" and "')" and
            # split on the separator to recover the three members.
            triple = relation[2:-2].split("', '")
            for ent in [(triple[0]), (triple[2])]:
                ent_hash = "entity" + "-" + get_sha256(ent)
                if ent_hash in existed_nodes:  # the entity must exist in the KG
                    ent_sim_scores.setdefault(ent_hash, []).append(similarity)

        ent_mean_scores = {}  # mean similarity per entity
        for ent_hash, scores in ent_sim_scores.items():
            # Sum of similarities divided by the entity's appearance count.
            # An entity that only occurs in self-loop triples is a graph node
            # without a count entry, so fall back to 1.0 instead of raising.
            ent_weights[ent_hash] = float(np.sum(scores)) / self.ent_appear_cnt.get(ent_hash, 1.0)
            # Mean similarity, used for the top-k filter below.
            ent_mean_scores[ent_hash] = float(np.mean(scores))

        if ent_weights:  # max()/min() would raise on an empty dict
            ent_weights_max = max(ent_weights.values())
            ent_weights_min = min(ent_weights.values())
            if ent_weights_max == ent_weights_min:
                # Only one distinct weight: assign 1 to all.
                for ent_hash in ent_weights.keys():
                    ent_weights[ent_hash] = 1.0
            else:
                down_edge = global_config.lpmm_knowledge.qa_paragraph_node_weight
                # Rescale into [down_edge, 1].
                for ent_hash, score in ent_weights.items():
                    ent_weights[ent_hash] = (
                        (score - ent_weights_min) * (1 - down_edge) / (ent_weights_max - ent_weights_min)
                    ) + down_edge

            # Keep only the top_k entities by mean similarity.
            top_k = global_config.lpmm_knowledge.qa_ent_filter_top_k
            if len(ent_mean_scores) > top_k:
                ranked = sorted(ent_mean_scores, key=ent_mean_scores.get, reverse=True)
                # Only the entities ranked below top_k lose their weight;
                # deleting every ranked entity would discard all entity weights.
                for ent_hash in ranked[top_k:]:
                    del ent_weights[ent_hash]

        # ---- Paragraph weights ----

        # Normalise the paragraph similarities of the search results.
        pg_sim_scores = {}
        pg_sim_score_max = 0.0
        pg_sim_score_min = 1.0
        for pg_hash, similarity in paragraph_search_result:
            pg_sim_score_max = max(pg_sim_score_max, similarity)
            pg_sim_score_min = min(pg_sim_score_min, similarity)
            pg_sim_scores[pg_hash] = similarity

        pg_sim_span = pg_sim_score_max - pg_sim_score_min
        for pg_hash, similarity in pg_sim_scores.items():
            if pg_sim_span == 0:
                # All similarities are equal (e.g. a single hit): assign 1,
                # as the entity branch does, instead of dividing by zero.
                pg_sim_scores[pg_hash] = 1.0
            else:
                pg_sim_scores[pg_hash] = (similarity - pg_sim_score_min) / pg_sim_span

        for pg_hash, score in pg_sim_scores.items():
            # Paragraph weight = normalised similarity * paragraph node weight.
            pg_weights[pg_hash] = score * global_config.lpmm_knowledge.qa_paragraph_node_weight

        # Final weights = entity weights + paragraph weights.
        ppr_node_weights = {k: v for d in [ent_weights, pg_weights] for k, v in d.items()}

        # Personalized PageRank.
        ppr_res = pagerank.run_pagerank(
            self.graph,
            personalization=ppr_node_weights,
            max_iter=100,
            alpha=global_config.lpmm_knowledge.qa_ppr_damping,
        )

        # Keep the paragraph nodes only, sorted by score from high to low.
        passage_node_res = [
            (node_key, score)
            for node_key, score in ppr_res.items()
            if node_key.startswith("paragraph")
        ]
        passage_node_res = sorted(passage_node_res, key=lambda item: item[1], reverse=True)

        return passage_node_res, ppr_node_weights
|
||||
79
src/chat/knowledge/knowledge_lib.py
Normal file
79
src/chat/knowledge/knowledge_lib.py
Normal file
@@ -0,0 +1,79 @@
|
||||
from src.chat.knowledge.embedding_store import EmbeddingManager
|
||||
from src.chat.knowledge.qa_manager import QAManager
|
||||
from src.chat.knowledge.kg_manager import KGManager
|
||||
from src.chat.knowledge.global_logger import logger
|
||||
from src.config.config import global_config
|
||||
import os
|
||||
|
||||
# Entity strings that must never become knowledge-graph entities:
# the empty string and bare Chinese pronouns.
INVALID_ENTITY = [
    "",
    "你",
    "他",
    "她",
    "它",
    "我们",
    "你们",
    "他们",
    "她们",
    "它们",
]

# Namespace identifiers for the RAG graph, entity counters and paragraph hashes.
RAG_GRAPH_NAMESPACE = "rag-graph"
RAG_ENT_CNT_NAMESPACE = "rag-ent-cnt"
RAG_PG_HASH_NAMESPACE = "rag-pg-hash"


# Three directory levels above this file; DATA_PATH is its "data" sub-directory.
ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
DATA_PATH = os.path.join(ROOT_PATH, "data")


# Both stay None unless the LPMM knowledge base is enabled below.
qa_manager = None
inspire_manager = None

# Initialise the LPMM knowledge base at import time, but only when it is enabled.
if global_config.lpmm_knowledge.enable:
    logger.info("正在初始化Mai-LPMM")
    logger.info("创建LLM客户端")

    # Initialise the embedding store.
    embed_manager = EmbeddingManager()
    logger.info("正在从文件加载Embedding库")
    try:
        embed_manager.load_from_file()
    except Exception as e:
        # Best effort: a load failure is reported as a warning and start-up continues.
        logger.warning(f"此消息不会影响正常使用:从文件加载Embedding库时,{e}")
        # (disabled hint) If no knowledge has been imported yet, this error can be ignored.
    logger.info("Embedding库加载完成")
    # Initialise the knowledge graph.
    kg_manager = KGManager()
    logger.info("正在从文件加载KG")
    try:
        kg_manager.load_from_file()
    except Exception as e:
        # Best effort, same as for the embedding store above.
        logger.warning(f"此消息不会影响正常使用:从文件加载KG时,{e}")
        # (disabled hint) If no knowledge has been imported yet, this error can be ignored.
    logger.info("KG加载完成")

    logger.info(f"KG节点数量:{len(kg_manager.graph.get_node_list())}")
    logger.info(f"KG边数量:{len(kg_manager.graph.get_edge_list())}")

    # Consistency check: every paragraph hash known to the KG should also exist
    # in the embedding store under the key "paragraph-<hash>".
    for pg_hash in kg_manager.stored_paragraph_hashes:
        key = f"paragraph-{pg_hash}"
        if key not in embed_manager.stored_pg_hashes:
            logger.warning(f"KG中存在Embedding库中不存在的段落:{key}")

    # Question-answering front end used for knowledge-base lookups.
    qa_manager = QAManager(
        embed_manager,
        kg_manager,
    )

    # Memory activation (for the memory store) is currently disabled:
    # inspire_manager = MemoryActiveManager(
    #     embed_manager,
    #     llm_client_list[global_config["embedding"]["provider"]],
    # )
else:
    logger.info("LPMM知识库已禁用,跳过初始化")
    # NOTE(review): the original comment here says placeholder objects are created
    # to avoid import errors, but nothing is created; qa_manager and
    # inspire_manager simply keep the None assigned above.
|
||||
154
src/chat/knowledge/open_ie.py
Normal file
154
src/chat/knowledge/open_ie.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
from .knowledge_lib import INVALID_ENTITY, ROOT_PATH, DATA_PATH
|
||||
# from src.manager.local_store_manager import local_storage
|
||||
|
||||
|
||||
def _filter_invalid_entities(entities: List[str]) -> List[str]:
    """Return the distinct usable entities found in ``entities``.

    An entry is dropped when it is not a string, is blank after stripping, or
    appears in ``INVALID_ENTITY``. Survivors are collected in a set, so
    duplicates collapse and the order of the returned list is unspecified.
    """
    kept = {
        candidate
        for candidate in entities
        if isinstance(candidate, str) and candidate.strip() != "" and candidate not in INVALID_ENTITY
    }
    return list(kept)
|
||||
|
||||
|
||||
def _filter_invalid_triples(triples: List[List[str]]) -> List[List[str]]:
|
||||
"""过滤无效的三元组"""
|
||||
unique_triples = set()
|
||||
valid_triples = []
|
||||
|
||||
for triple in triples:
|
||||
if len(triple) != 3 or (
|
||||
(not isinstance(triple[0], str) or triple[0].strip() == "")
|
||||
or (not isinstance(triple[1], str) or triple[1].strip() == "")
|
||||
or (not isinstance(triple[2], str) or triple[2].strip() == "")
|
||||
):
|
||||
# 三元组长度不为3,或其中存在空值
|
||||
continue
|
||||
|
||||
valid_triple = [str(item) for item in triple]
|
||||
if tuple(valid_triple) not in unique_triples:
|
||||
unique_triples.add(tuple(valid_triple))
|
||||
valid_triples.append(valid_triple)
|
||||
|
||||
return valid_triples
|
||||
|
||||
|
||||
class OpenIE:
    """
    In-memory form of OpenIE extraction results.

    The serialised layout is:
    {
        "docs": [
            {
                "idx": "unique document id (usually the SHA256 hash of the text)",
                "passage": "original text of the document",
                "extracted_entities": ["entity 1", "entity 2", ...],
                "extracted_triples": [["subject", "predicate", "object"], ...]
            },
            ...
        ],
        "avg_ent_chars": "average number of characters per entity",
        "avg_ent_words": "average number of words per entity"
    }
    """

    def __init__(
        self,
        docs: List[Dict[str, Any]],
        avg_ent_chars,
        avg_ent_words,
    ):
        self.docs = docs
        self.avg_ent_chars = avg_ent_chars
        self.avg_ent_words = avg_ent_words

        # Clean every document in place: entities first, then triples.
        for record in self.docs:
            record["extracted_entities"] = _filter_invalid_entities(record["extracted_entities"])
            record["extracted_triples"] = _filter_invalid_triples(record["extracted_triples"])

    @staticmethod
    def _from_dict(data_list):
        """Merge several OpenIE dictionaries into a single OpenIE object."""
        merged_docs = []
        for payload in data_list:
            merged_docs += payload.get("docs", [])

        # Recompute the entity statistics over the merged documents.
        entities = [ent for record in merged_docs for ent in record["extracted_entities"]]
        total_chars = sum(len(ent) for ent in entities)
        total_words = sum(len(ent.split()) for ent in entities)
        entity_count = len(entities)

        if entity_count:
            avg_ent_chars = round(total_chars / entity_count, 4)
            avg_ent_words = round(total_words / entity_count, 4)
        else:
            avg_ent_chars = 0
            avg_ent_words = 0

        return OpenIE(
            docs=merged_docs,
            avg_ent_chars=avg_ent_chars,
            avg_ent_words=avg_ent_words,
        )

    def _to_dict(self):
        """Return the serialisable dictionary form of this object."""
        return {
            "docs": self.docs,
            "avg_ent_chars": self.avg_ent_chars,
            "avg_ent_words": self.avg_ent_words,
        }

    @staticmethod
    def load() -> "OpenIE":
        """Load and merge every ``*.json`` file found in ``<DATA_PATH>/openie``.

        Raises:
            Exception: if the directory does not exist or holds no json file.
        """
        openie_dir = os.path.join(DATA_PATH, "openie")
        if not os.path.exists(openie_dir):
            raise Exception(f"OpenIE数据目录不存在: {openie_dir}")

        data_list = []
        # Sorted so files are merged in a stable order.
        for path in sorted(glob.glob(os.path.join(openie_dir, "*.json"))):
            with open(path, "r", encoding="utf-8") as handle:
                data_list.append(json.load(handle))

        if not data_list:
            raise Exception(f"未在 {openie_dir} 找到任何OpenIE json文件")
        return OpenIE._from_dict(data_list)

    def extract_entity_dict(self):
        """Map document idx -> entity list, skipping documents without entities."""
        ner_output_dict = {}
        for record in self.docs:
            if len(record["extracted_entities"]) > 0:
                ner_output_dict[record["idx"]] = record["extracted_entities"]
        return ner_output_dict

    def extract_triple_dict(self):
        """Map document idx -> triple list, skipping documents without triples."""
        triple_output_dict = {}
        for record in self.docs:
            if len(record["extracted_triples"]) > 0:
                triple_output_dict[record["idx"]] = record["extracted_triples"]
        return triple_output_dict

    def extract_raw_paragraph_dict(self):
        """Map document idx -> original passage text for every document."""
        raw_paragraph_dict = {}
        for record in self.docs:
            raw_paragraph_dict[record["idx"]] = record["passage"]
        return raw_paragraph_dict
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: print the project root imported from knowledge_lib.
    print(ROOT_PATH)
|
||||
70
src/chat/knowledge/prompt_template.py
Normal file
70
src/chat/knowledge/prompt_template.py
Normal file
@@ -0,0 +1,70 @@
|
||||
entity_extract_system_prompt = """你是一个性能优异的实体提取系统。请从段落中提取出所有实体,并以JSON列表的形式输出。
|
||||
|
||||
输出格式示例:
|
||||
[ "实体A", "实体B", "实体C" ]
|
||||
|
||||
请注意以下要求:
|
||||
- 将代词(如“你”、“我”、“他”、“她”、“它”等)转化为对应的实体命名,以避免指代不清。
|
||||
- 尽可能多的提取出段落中的全部实体;
|
||||
"""
|
||||
|
||||
|
||||
def build_entity_extract_context(paragraph: str) -> str:
    """Return the full entity-extraction prompt for ``paragraph``.

    The result is the system prompt, a blank line, and the paragraph wrapped
    in a fenced block under its heading.
    """
    fence = "```"
    pieces = [
        f"{entity_extract_system_prompt}",
        "",
        "段落:",
        fence,
        f"{paragraph}",
        fence,
    ]
    return "\n".join(pieces)
|
||||
|
||||
|
||||
rdf_triple_extract_system_prompt = """你是一个性能优异的RDF(资源描述框架,由节点和边组成,节点表示实体/资源、属性,边则表示了实体和实体之间的关系以及实体和属性的关系。)构造系统。你的任务是根据给定的段落和实体列表构建RDF图。
|
||||
|
||||
请使用JSON回复,使用三元组的JSON列表输出RDF图中的关系(每个三元组代表一个关系)。
|
||||
|
||||
输出格式示例:
|
||||
[
|
||||
["某实体","关系","某属性"],
|
||||
["某实体","关系","某实体"],
|
||||
["某资源","关系","某属性"]
|
||||
]
|
||||
|
||||
请注意以下要求:
|
||||
- 每个三元组应包含每个段落的实体命名列表中的至少一个命名实体,但最好是两个。
|
||||
- 将代词(如“你”、“我”、“他”、“她”、“它”等)转化为对应的实体命名,以避免指代不清。
|
||||
"""
|
||||
|
||||
|
||||
def build_rdf_triple_extract_context(paragraph: str, entities: str) -> str:
    """Return the full RDF-triple-extraction prompt.

    The result is the system prompt followed by the paragraph and the entity
    list, each wrapped in a fenced block under its own heading.
    """
    fence = "```"
    pieces = [
        f"{rdf_triple_extract_system_prompt}",
        "",
        "段落:",
        fence,
        f"{paragraph}",
        fence,
        "",
        "实体列表:",
        fence,
        f"{entities}",
        fence,
    ]
    return "\n".join(pieces)
|
||||
|
||||
|
||||
qa_system_prompt = """
|
||||
你是一个性能优异的QA系统。请根据给定的问题和一些可能对你有帮助的信息作出回答。
|
||||
|
||||
请注意以下要求:
|
||||
- 你可以使用给定的信息来回答问题,但请不要直接引用它们。
|
||||
- 你的回答应该简洁明了,避免冗长的解释。
|
||||
- 如果你无法回答问题,请直接说“我不知道”。
|
||||
"""
|
||||
|
||||
|
||||
# def build_qa_context(question: str, knowledge: list[tuple[str, str, str]]) -> list[LLMMessage]:
|
||||
# knowledge = "\n".join([f"{i + 1}. 相关性:{k[0]}\n{k[1]}" for i, k in enumerate(knowledge)])
|
||||
# messages = [
|
||||
# LLMMessage("system", qa_system_prompt).to_dict(),
|
||||
# LLMMessage("user", f"问题:\n{question}\n\n可能有帮助的信息:\n{knowledge}").to_dict(),
|
||||
# ]
|
||||
# return messages
|
||||
124
src/chat/knowledge/qa_manager.py
Normal file
124
src/chat/knowledge/qa_manager.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import time
|
||||
from typing import Tuple, List, Dict, Optional
|
||||
|
||||
from .global_logger import logger
|
||||
from .embedding_store import EmbeddingManager
|
||||
from .kg_manager import KGManager
|
||||
|
||||
# from .lpmmconfig import global_config
|
||||
from .utils.dyn_topk import dyn_select_top_k
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.chat.utils.utils import get_embedding
|
||||
from src.config.config import global_config, model_config
|
||||
|
||||
MAX_KNOWLEDGE_LENGTH = 10000  # maximum number of characters of knowledge text returned by get_knowledge
|
||||
|
||||
|
||||
class QAManager:
    """Answers knowledge-base questions from the embedding stores and the knowledge graph."""

    def __init__(
        self,
        embed_manager: EmbeddingManager,
        kg_manager: KGManager,
    ):
        """Keep references to both managers and create the QA model request object."""
        self.embed_manager = embed_manager
        self.kg_manager = kg_manager
        self.qa_model = LLMRequest(model_set=model_config.model_task_config.lpmm_qa, request_type="lpmm.qa")

    async def process_query(self, question: str) -> Optional[Tuple[List[Tuple[str, float, float]], Optional[Dict[str, float]]]]:
        """Retrieve the paragraphs most relevant to ``question``.

        Steps: embed the question, search the relation store, search the
        paragraph store, then either run the knowledge-graph search (when
        relevant relations were found) or fall back to the plain paragraph
        results. The final list is filtered with ``dyn_select_top_k``.

        Returns:
            ``(result, ppr_node_weights)``; ``ppr_node_weights`` is ``None``
            when the knowledge-graph search was not used. Returns ``None`` when
            the question cannot be embedded or the relation search yields
            ``None``.
        """
        # Embed the question.
        part_start_time = time.perf_counter()
        question_embedding = await get_embedding(question)
        if question_embedding is None:
            logger.error("生成问题Embedding失败")
            return None
        part_end_time = time.perf_counter()
        logger.debug(f"Embedding用时:{part_end_time - part_start_time:.5f}s")

        # Search the relation embedding store with the question embedding.
        part_start_time = time.perf_counter()
        relation_search_res = self.embed_manager.relation_embedding_store.search_top_k(
            question_embedding,
            global_config.lpmm_knowledge.qa_relation_search_top_k,
        )
        if relation_search_res is None:
            return None
        # Dynamic threshold: keep only the clearly stronger hits when the scores
        # differ significantly, otherwise keep everything.
        relation_search_res = dyn_select_top_k(relation_search_res, 0.5, 1.0)
        if not relation_search_res or relation_search_res[0][1] < global_config.lpmm_knowledge.qa_relation_threshold:
            # No sufficiently relevant relation: skip relation-based retrieval.
            logger.debug("未找到相关关系,跳过关系检索")
            relation_search_res = []

        part_end_time = time.perf_counter()
        logger.debug(f"关系检索用时:{part_end_time - part_start_time:.5f}s")

        relation_store = self.embed_manager.relation_embedding_store.store
        for res in relation_search_res:
            store_item = relation_store.get(res[0])
            if store_item is None:
                # A hit without a stored item must not abort the whole query
                # just because its text cannot be displayed.
                logger.warning(f"关系Embedding库中缺少条目:{res[0]}")
                continue
            print(f"找到相关关系,相似度:{(res[1] * 100):.2f}% - {store_item.str}")

        # TODO: filter the triple results with an LLM.

        # Search the paragraph embedding store with the question embedding.
        part_start_time = time.perf_counter()
        paragraph_search_res = self.embed_manager.paragraphs_embedding_store.search_top_k(
            question_embedding,
            global_config.lpmm_knowledge.qa_paragraph_search_top_k,
        )
        part_end_time = time.perf_counter()
        logger.debug(f"文段检索用时:{part_end_time - part_start_time:.5f}s")

        if len(relation_search_res) != 0:
            logger.info("找到相关关系,将使用RAG进行检索")
            # Knowledge-graph retrieval.
            part_start_time = time.perf_counter()
            result, ppr_node_weights = self.kg_manager.kg_search(
                relation_search_res, paragraph_search_res, self.embed_manager
            )
            part_end_time = time.perf_counter()
            logger.info(f"RAG检索用时:{part_end_time - part_start_time:.5f}s")
        else:
            logger.info("未找到相关关系,将使用文段检索结果")
            result = paragraph_search_res
            ppr_node_weights = None

        # Apply the dynamic threshold to the final candidates.
        result = dyn_select_top_k(result, 0.5, 1.0)

        for res in result:
            raw_paragraph = self.embed_manager.paragraphs_embedding_store.store[res[0]].str
            print(f"找到相关文段,相关系数:{res[1]:.8f}\n{raw_paragraph}\n\n")

        return result, ppr_node_weights

    async def get_knowledge(self, question: str) -> Optional[str]:
        """Return the retrieved knowledge for ``question`` as one text block.

        Each paragraph is listed with its relevance score. The text is cut to
        ``MAX_KNOWLEDGE_LENGTH`` characters. Returns ``None`` when the query
        could not be processed or produced no result.
        """
        processed_result = await self.process_query(question)
        if processed_result is None:
            logger.debug("LPMM知识库并未初始化,可能是从未导入过知识...")
            return None

        query_res = processed_result[0]
        if not query_res:
            logger.debug("知识库查询结果为空,可能是知识库中没有相关内容")
            return None

        paragraph_store = self.embed_manager.paragraphs_embedding_store.store
        knowledge = [(paragraph_store[res[0]].str, res[1]) for res in query_res]
        found_knowledge = "\n".join(
            [f"第{i + 1}条知识:{k[0]}\n 该条知识对于问题的相关性:{k[1]}" for i, k in enumerate(knowledge)]
        )
        if len(found_knowledge) > MAX_KNOWLEDGE_LENGTH:
            found_knowledge = found_knowledge[:MAX_KNOWLEDGE_LENGTH] + "\n"
        return found_knowledge
|
||||
0
src/chat/knowledge/utils/__init__.py
Normal file
0
src/chat/knowledge/utils/__init__.py
Normal file
51
src/chat/knowledge/utils/dyn_topk.py
Normal file
51
src/chat/knowledge/utils/dyn_topk.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from typing import List, Any, Tuple
|
||||
|
||||
|
||||
def dyn_select_top_k(
    score: List[Tuple[Any, float]], jmp_factor: float, var_factor: float
) -> List[Tuple[Any, float, float]]:
    """Select the significant head of a scored list using a dynamic threshold.

    Scores are sorted in descending order and min-max normalised. An item is
    kept when its normalised score is strictly greater than::

        jmp_factor * jump_threshold + (1 - jmp_factor) * (mean + var_factor * variance)

    Args:
        score: ``(key, score)`` pairs.
        jmp_factor: weight of the jump threshold in the final threshold.
        var_factor: weight of the variance term.

    Returns:
        ``(key, score, normalised_score)`` tuples, highest score first. An
        empty input yields an empty list. When every score is equal (including
        a single-element input) there is no significant difference to select
        on, so all items are returned with a normalised score of ``1.0``.
    """
    if not score:
        return []

    # Sort by score, highest first.
    sorted_score = sorted(score, key=lambda x: x[1], reverse=True)

    max_score = sorted_score[0][1]
    min_score = sorted_score[-1][1]
    score_range = max_score - min_score
    if score_range == 0:
        # Min-max normalisation would divide by zero; keep every item.
        return [(item[0], item[1], 1.0) for item in sorted_score]

    normalized_score = [(item[0], item[1], (item[1] - min_score) / score_range) for item in sorted_score]

    # Find the jump point: the position with the largest change in score.
    # NOTE(review): with jump_idx starting at 0 the reference gap is
    # |normalized[0] - normalized[-1]|, i.e. 1.0 after normalisation, which no
    # adjacent gap can exceed, so jump_idx stays 0 here. Kept as-is so that
    # retrieval results do not change; confirm the intended behaviour.
    jump_idx = 0
    for i in range(1, len(normalized_score)):
        if abs(normalized_score[i][2] - normalized_score[i - 1][2]) > abs(
            normalized_score[jump_idx][2] - normalized_score[jump_idx - 1][2]
        ):
            jump_idx = i
    jump_threshold = normalized_score[jump_idx][2]

    # Mean and (population) variance of the normalised scores.
    count = len(normalized_score)
    mean_score = sum(s[2] for s in normalized_score) / count
    var_score = sum((s[2] - mean_score) ** 2 for s in normalized_score) / count

    # Dynamic threshold.
    threshold = jmp_factor * jump_threshold + (1 - jmp_factor) * (mean_score + var_factor * var_score)

    return [s for s in normalized_score if s[2] > threshold]
|
||||
8
src/chat/knowledge/utils/hash.py
Normal file
8
src/chat/knowledge/utils/hash.py
Normal file
@@ -0,0 +1,8 @@
|
||||
import hashlib
|
||||
|
||||
|
||||
def get_sha256(string: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``string`` encoded as UTF-8."""
    encoded = string.encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
|
||||
98
src/chat/knowledge/utils/json_fix.py
Normal file
98
src/chat/knowledge/utils/json_fix.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import json
|
||||
from json_repair import repair_json
|
||||
|
||||
|
||||
def _find_unclosed(json_str):
|
||||
"""
|
||||
Identifies the unclosed braces and brackets in the JSON string.
|
||||
|
||||
Args:
|
||||
json_str (str): The JSON string to analyze.
|
||||
|
||||
Returns:
|
||||
list: A list of unclosed elements in the order they were opened.
|
||||
"""
|
||||
unclosed = []
|
||||
inside_string = False
|
||||
escape_next = False
|
||||
|
||||
for char in json_str:
|
||||
if inside_string:
|
||||
if escape_next:
|
||||
escape_next = False
|
||||
elif char == "\\":
|
||||
escape_next = True
|
||||
elif char == '"':
|
||||
inside_string = False
|
||||
else:
|
||||
if char == '"':
|
||||
inside_string = True
|
||||
elif char in "{[":
|
||||
unclosed.append(char)
|
||||
elif char in "}]":
|
||||
if unclosed and ((char == "}" and unclosed[-1] == "{") or (char == "]" and unclosed[-1] == "[")):
|
||||
unclosed.pop()
|
||||
|
||||
return unclosed
|
||||
|
||||
|
||||
# The following code is used to fix a broken JSON string.
|
||||
# From HippoRAG2 (GitHub: OSU-NLP-Group/HippoRAG)
|
||||
def fix_broken_generated_json(json_str: str) -> str:
    """
    Repair a truncated JSON string.

    A string that ``json.loads`` already accepts is returned unchanged.
    Otherwise:
    - everything from the last comma onwards is dropped;
    - the remaining text is scanned once for unclosed braces/brackets
      (those inside string literals are not counted);
    - the matching closers are appended, innermost first.

    Args:
        json_str (str): The malformed JSON string to be fixed.

    Returns:
        str: The corrected JSON string.
    """
    try:
        json.loads(json_str)
    except json.JSONDecodeError:
        pass
    else:
        # Already valid: hand it back untouched.
        return json_str

    # Step 1: cut at the last comma, if there is one.
    cut = json_str.rfind(",")
    truncated = json_str if cut == -1 else json_str[:cut]

    # Step 2 and 3: close whatever is still open, in reverse order of opening.
    still_open = _find_unclosed(truncated)
    closers = "".join("}" if opener == "{" else "]" for opener in reversed(still_open))
    return truncated + closers
|
||||
|
||||
|
||||
def new_fix_broken_generated_json(json_str: str) -> str:
    """
    Repair a malformed JSON string with the json-repair library.

    A string that ``json.loads`` already accepts is returned unchanged;
    anything else is passed to ``repair_json``.

    Args:
        json_str (str): The JSON string to check and, if needed, repair.

    Returns:
        str: The original string when valid, otherwise the repaired one.
    """
    try:
        json.loads(json_str)
    except json.JSONDecodeError:
        # Not valid JSON: let json-repair try to fix it.
        return repair_json(json_str)
    else:
        return json_str
|
||||
1716
src/chat/memory_system/Hippocampus.py
Normal file
1716
src/chat/memory_system/Hippocampus.py
Normal file
File diff suppressed because it is too large
Load Diff
254
src/chat/memory_system/instant_memory.py
Normal file
254
src/chat/memory_system/instant_memory.py
Normal file
@@ -0,0 +1,254 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import time
|
||||
import re
|
||||
import json
|
||||
import ast
|
||||
import traceback
|
||||
|
||||
from json_repair import repair_json
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.common.logger import get_logger
|
||||
from src.common.database.sqlalchemy_models import Memory # SQLAlchemy Models导入
|
||||
from src.common.database.sqlalchemy_database_api import get_session
|
||||
from src.config.config import model_config
|
||||
|
||||
from sqlalchemy import select
|
||||
# Module-level logger.
logger = get_logger(__name__)
# Obtained once at import time and used by every InstantMemory instance in this
# module for both writes (store_memory) and reads (get_memory).
session = get_session()
|
||||
|
||||
class MemoryItem:
    """Plain record describing one remembered piece of information for a chat."""

    def __init__(self, memory_id: str, chat_id: str, memory_text: str, keywords: list[str]):
        """Store the given fields and stamp both timestamps with the current time."""
        self.memory_id, self.chat_id = memory_id, chat_id
        self.memory_text: str = memory_text
        self.keywords: list[str] = keywords
        # Creation and last-view times both start at "now".
        self.create_time: float = time.time()
        self.last_view_time: float = time.time()
|
||||
|
||||
|
||||
class MemoryManager:
    """Placeholder manager; it currently keeps no state."""

    def __init__(self):
        # Intentionally empty: the in-memory item list (memory_items) is disabled.
        return None
|
||||
|
||||
|
||||
class InstantMemory:
    """Per-chat instant memory: decides what to remember, stores it, and looks it up by keyword and time."""

    def __init__(self, chat_id):
        """Bind the instance to ``chat_id`` and create the summarisation model request."""
        self.chat_id = chat_id
        self.last_view_time = time.time()
        self.summary_model = LLMRequest(
            model_set=model_config.model_task_config.utils,
            request_type="memory.summary",
        )

    @staticmethod
    def _split_keywords(keywords):
        """Normalise an LLM ``keywords`` field: split a "/"-separated string, pass a list through, else ``[]``."""
        if isinstance(keywords, str):
            return [k.strip() for k in keywords.split("/") if k.strip()]
        if isinstance(keywords, list):
            return keywords
        return []

    @staticmethod
    def _parse_stored_keywords(raw):
        """Decode the keywords stored on a memory row into a list of non-blank strings.

        Accepts an actual list or the ``repr`` of a list. Empty, missing or
        malformed values yield ``[]`` so that a single bad row cannot abort a
        whole lookup.
        """
        if isinstance(raw, list):
            parsed = raw
        elif isinstance(raw, str) and raw.strip():
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                return []
        else:
            return []
        if not isinstance(parsed, list):
            return []
        return [str(k).strip() for k in parsed if str(k).strip()]

    async def if_need_build(self, text):
        """Ask the model whether ``text`` holds anything worth remembering.

        Returns:
            ``True`` when the model's answer contains "1"; ``False`` otherwise,
            including on an empty answer or any error.
        """
        prompt = f"""
请判断以下内容中是否有值得记忆的信息,如果有,请输出1,否则输出0
{text}
请只输出1或0就好
"""

        try:
            response, _ = await self.summary_model.generate_response_async(prompt, temperature=0.5)
            # Debug output goes through the logger rather than stdout.
            logger.debug(f"if_need_build prompt: {prompt}")
            logger.debug(f"if_need_build response: {response}")
            if not response:
                return False
            return "1" in response
        except Exception as e:
            logger.error(f"判断是否需要记忆出现错误:{str(e)} {traceback.format_exc()}")
            return False

    async def build_memory(self, text):
        """Summarise ``text`` into a memory.

        Returns:
            ``{"memory_text": str, "keywords": list}`` or ``None`` when the
            model gives no answer, the answer cannot be parsed, or an error
            occurs.
        """
        prompt = f"""
以下内容中存在值得记忆的信息,请你从中总结出一段值得记忆的信息,并输出
{text}
请以json格式输出一段概括的记忆内容和关键词
{{
"memory_text": "记忆内容",
"keywords": "关键词,用/划分"
}}
"""
        try:
            response, _ = await self.summary_model.generate_response_async(prompt, temperature=0.5)
            if not response:
                return None
            try:
                repaired = repair_json(response)
                result = json.loads(repaired)
                memory_text = result.get("memory_text", "")
                keywords_list = self._split_keywords(result.get("keywords", ""))
                return {"memory_text": memory_text, "keywords": keywords_list}
            except Exception as parse_e:
                logger.error(f"解析记忆json失败:{str(parse_e)} {traceback.format_exc()}")
                return None
        except Exception as e:
            logger.error(f"构建记忆出现错误:{str(e)} {traceback.format_exc()}")
            return None

    async def create_and_store_memory(self, text):
        """Build and store a memory for ``text`` when the model judges it worth remembering."""
        if_need = await self.if_need_build(text)
        if if_need:
            logger.info(f"需要记忆:{text}")
            memory = await self.build_memory(text)
            if memory and memory.get("memory_text"):
                memory_id = f"{self.chat_id}_{time.time()}"
                memory_item = MemoryItem(
                    memory_id=memory_id,
                    chat_id=self.chat_id,
                    memory_text=memory["memory_text"],
                    keywords=memory.get("keywords", []),
                )
                await self.store_memory(memory_item)
        else:
            logger.info(f"不需要记忆:{text}")

    async def store_memory(self, memory_item: "MemoryItem"):
        """Persist ``memory_item`` through the module-level session.

        Raises:
            Whatever the session raises on add/commit; the session is rolled
            back first so that it stays usable for later calls.
        """
        # NOTE(review): keywords is passed as a Python list while get_memory reads
        # it back through ast.literal_eval; confirm how the Memory column stores it.
        memory = Memory(
            memory_id=memory_item.memory_id,
            chat_id=memory_item.chat_id,
            memory_text=memory_item.memory_text,
            keywords=memory_item.keywords,
            create_time=memory_item.create_time,
            last_view_time=memory_item.last_view_time,
        )
        try:
            session.add(memory)
            session.commit()
        except Exception:
            # The session is shared module-wide: roll back before propagating.
            session.rollback()
            raise

    async def get_memory(self, target: str):
        """Retrieve stored memories of this chat that are relevant to ``target``.

        The model is asked for search keywords and an optional time hint. Rows
        of this chat (restricted to the time window when one was parsed) are
        returned when at least one requested keyword is among their stored
        keywords.

        Returns:
            A list of distinct memory texts (order unspecified), or ``None``
            when the model gives no answer or an error occurs.
        """
        prompt = f"""
请根据以下发言内容,判断是否需要提取记忆
{target}
请用json格式输出,包含以下字段:
其中,time的要求是:
可以选择具体日期时间,格式为YYYY-MM-DD HH:MM:SS,或者大致时间,格式为YYYY-MM-DD
可以选择相对时间,例如:今天,昨天,前天,5天前,1个月前
可以选择留空进行模糊搜索
{{
"need_memory": 1,
"keywords": "希望获取的记忆关键词,用/划分",
"time": "希望获取的记忆大致时间"
}}
请只输出json格式,不要输出其他多余内容
"""
        try:
            response, _ = await self.summary_model.generate_response_async(prompt, temperature=0.5)
            # Debug output goes through the logger rather than stdout.
            logger.debug(f"get_memory prompt: {prompt}")
            logger.debug(f"get_memory response: {response}")
            if not response:
                return None
            try:
                repaired = repair_json(response)
                result = json.loads(repaired)
                keywords_list = self._split_keywords(result.get("keywords", ""))

                # The model may return null or a non-string for "time".
                raw_time = result.get("time", "")
                time_str = raw_time.strip() if isinstance(raw_time, str) else ""
                start_time, end_time = self._parse_time_range(time_str)
                logger.info(f"start_time: {start_time}, end_time: {end_time}")

                memories_set = set()
                if start_time and end_time:
                    start_ts = start_time.timestamp()
                    end_ts = end_time.timestamp()
                    query = session.execute(select(Memory).where(
                        (Memory.chat_id == self.chat_id)
                        & (Memory.create_time >= start_ts)
                        & (Memory.create_time < end_ts)
                    )).scalars()
                else:
                    # No usable window (start_time is 0): search every memory of this chat.
                    query = session.execute(select(Memory).where(Memory.chat_id == self.chat_id)).scalars()

                for mem in query:
                    mem_keywords = self._parse_stored_keywords(mem.keywords)
                    if any(kw in mem_keywords for kw in keywords_list):
                        memories_set.add(mem.memory_text)
                return list(memories_set)
            except Exception as parse_e:
                logger.error(f"解析记忆json失败:{str(parse_e)} {traceback.format_exc()}")
                return None
        except Exception as e:
            logger.error(f"获取记忆出现错误:{str(e)} {traceback.format_exc()}")
            return None

    def _parse_time_range(self, time_str):
        """
        Turn a time hint into a ``(start, end)`` pair.

        Supported forms:
        - exact date-time ``YYYY-MM-DD HH:MM:SS``: a one-hour window from it
        - date ``YYYY-MM-DD``: that whole day
        - relative: 今天, 昨天, 前天, N天前, N个月前 (a month counts as 30 days),
          each giving the whole day in question
        - empty or unrecognised input: ``(0, now)``; the falsy start tells
          ``get_memory`` that no time filter applies
        """
        now = datetime.now()
        if not time_str:
            return 0, now
        time_str = time_str.strip()

        # Absolute forms, most specific first.
        for fmt, span in (
            ("%Y-%m-%d %H:%M:%S", timedelta(hours=1)),
            ("%Y-%m-%d", timedelta(days=1)),
        ):
            try:
                dt = datetime.strptime(time_str, fmt)
            except ValueError:
                continue
            return dt, dt + span

        # Relative forms, all reduced to "N days ago".
        days_ago = {"今天": 0, "昨天": 1, "前天": 2}.get(time_str)
        if days_ago is None:
            if m := re.match(r"(\d+)天前", time_str):
                days_ago = int(m.group(1))
            elif m := re.match(r"(\d+)个月前", time_str):
                days_ago = int(m.group(1)) * 30
        if days_ago is None:
            # Nothing matched.
            return 0, now

        start = (now - timedelta(days=days_ago)).replace(hour=0, minute=0, second=0, microsecond=0)
        return start, start + timedelta(days=1)
|
||||
144
src/chat/memory_system/memory_activator.py
Normal file
144
src/chat/memory_system/memory_activator.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import difflib
|
||||
import json
|
||||
|
||||
from json_repair import repair_json
|
||||
from typing import List, Dict
|
||||
from datetime import datetime
|
||||
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.config.config import global_config, model_config
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
||||
from src.chat.memory_system.Hippocampus import hippocampus_manager
|
||||
|
||||
|
||||
logger = get_logger("memory_activator")
|
||||
|
||||
|
||||
def get_keywords_from_json(json_str) -> List:
    """
    Extract the keyword list from a (possibly malformed) JSON string.

    The text is first passed through ``repair_json``; the ``"keywords"`` field
    of the resulting object is then normalised so that callers always receive
    a list of non-blank strings:
    - a list keeps only its non-blank string items (stripped);
    - a bare string is treated as one keyword instead of being split into
      characters by the caller;
    - any other value, or any parsing failure, yields ``[]``.

    Args:
        json_str: JSON-formatted text, typically a model response.

    Returns:
        List[str]: the keywords.
    """
    try:
        fixed_json = repair_json(json_str)
        # repair_json may hand back a string that still has to be parsed.
        result = json.loads(fixed_json) if isinstance(fixed_json, str) else fixed_json
        keywords = result.get("keywords", [])
    except Exception as e:
        logger.error(f"解析关键词JSON失败: {e}")
        return []

    if isinstance(keywords, str):
        keywords = [keywords]
    elif not isinstance(keywords, list):
        return []
    return [kw.strip() for kw in keywords if isinstance(kw, str) and kw.strip()]
|
||||
|
||||
|
||||
def init_prompt():
    """Register the keyword-extraction prompt under the name ``memory_activator_prompt``."""
    # Group-chat prompt: the model reads the chat log, the message to reply to and
    # the previously used keywords, and answers with a JSON object of new keywords.
    Prompt(
        """
你是一个记忆分析器,你需要根据以下信息来进行回忆
以下是一段聊天记录,请根据这些信息,总结出几个关键词作为记忆回忆的触发词

聊天记录:
{obs_info_text}
你想要回复的消息:
{target_message}

历史关键词(请避免重复提取这些关键词):
{cached_keywords}

请输出一个json格式,包含以下字段:
{{
"keywords": ["关键词1", "关键词2", "关键词3",......]
}}
不要输出其他多余内容,只输出json格式就好
""",
        "memory_activator_prompt",
    )
|
||||
|
||||
|
||||
class MemoryActivator:
    """Activates stored memories that relate to the current chat via LLM-extracted keywords."""

    def __init__(self):
        """Create the keyword model request and the empty running-memory / keyword caches."""
        self.key_words_model = LLMRequest(
            model_set=model_config.model_task_config.utils_small,
            request_type="memory.activator",
        )

        self.running_memory = []
        self.cached_keywords = set()  # previously extracted keywords, shown to the model to avoid repeats
        self._keyword_order = []  # the same keywords, oldest first, so eviction can drop the oldest

    def _remember_keywords(self, keywords):
        """Add ``keywords`` to the cache, evicting the oldest entries first.

        When the cache already holds more than 10 keywords it is cut down to
        the 8 most recently added ones before the new keywords are appended. A
        set has no order, so the insertion order is tracked separately.
        """
        order = [kw for kw in getattr(self, "_keyword_order", []) if kw in self.cached_keywords]
        # Pick up anything that was added to the set without going through this helper.
        tracked = set(order)
        order.extend(kw for kw in self.cached_keywords if kw not in tracked)

        if len(order) > 10:
            order = order[-8:]
        for kw in keywords:
            if kw not in order:
                order.append(kw)

        self._keyword_order = order
        self.cached_keywords = set(order)

    async def activate_memory_with_chat_history(self, target_message, chat_history_prompt) -> List[Dict]:
        """
        Activate memories for the current conversation.

        Keywords are extracted from the chat history and the target message,
        related memories are fetched from ``hippocampus_manager``, and the
        running-memory list is updated: existing entries age by one step and
        are dropped at a duration of 3, new non-duplicate memories are
        appended, and at most the last 3 entries are kept.

        Args:
            target_message: the message about to be replied to.
            chat_history_prompt: the chat history text given to the model.

        Returns:
            The current running-memory list (dicts with ``topic``, ``content``,
            ``timestamp`` and ``duration``); an empty list when the memory
            system is disabled.
        """
        # Memory system disabled: nothing to activate.
        if not global_config.memory.enable_memory:
            return []

        # Render the cached keywords for the prompt.
        cached_keywords_str = ", ".join(self.cached_keywords) if self.cached_keywords else "暂无历史关键词"

        prompt = await global_prompt_manager.format_prompt(
            "memory_activator_prompt",
            obs_info_text=chat_history_prompt,
            target_message=target_message,
            cached_keywords=cached_keywords_str,
        )

        response, _ = await self.key_words_model.generate_response_async(prompt, temperature=0.5)

        extracted = get_keywords_from_json(response) or []
        if isinstance(extracted, str):
            # A bare string is one keyword, not a sequence of characters.
            extracted = [extracted]
        # Only non-empty strings can be joined into the prompt or cached in the set.
        keywords = [kw for kw in extracted if isinstance(kw, str) and kw]

        if keywords:
            self._remember_keywords(keywords)

        # Ask the memory system for memories related to the keywords.
        related_memory = await hippocampus_manager.get_memory_from_topic(
            valid_keywords=keywords, max_memory_num=3, max_memory_length=2, max_depth=3
        )

        logger.debug(f"当前记忆关键词: {self.cached_keywords} ")
        logger.debug(f"获取到的记忆: {related_memory}")

        # Every activation ages the existing memories; they are removed at duration 3.
        for m in self.running_memory:
            m["duration"] = m.get("duration", 1) + 1
        self.running_memory = [m for m in self.running_memory if m["duration"] < 3]

        if related_memory:
            for topic, memory in related_memory:
                # Skip memories with the same topic or near-identical content (similarity >= 0.7).
                exists = any(
                    m["topic"] == topic or difflib.SequenceMatcher(None, m["content"], memory).ratio() >= 0.7
                    for m in self.running_memory
                )
                if not exists:
                    self.running_memory.append(
                        {"topic": topic, "content": memory, "timestamp": datetime.now().isoformat(), "duration": 1}
                    )
                    logger.debug(f"添加新记忆: {topic} - {memory}")

        # Keep at most the last 3 running memories.
        if len(self.running_memory) > 3:
            self.running_memory = self.running_memory[-3:]

        return self.running_memory
|
||||
|
||||
|
||||
# Register the prompt template as soon as this module is imported.
init_prompt()
|
||||
126
src/chat/memory_system/sample_distribution.py
Normal file
126
src/chat/memory_system/sample_distribution.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import numpy as np
|
||||
from datetime import datetime, timedelta
|
||||
from rich.traceback import install
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
|
||||
class MemoryBuildScheduler:
    """Sample past timestamps from a weighted mixture of two normal distributions.

    Each component describes "hours before ``base_time``" with its own mean and
    standard deviation. ``base_time`` is captured once, when the scheduler is
    constructed.
    """

    def __init__(self, n_hours1, std_hours1, weight1, n_hours2, std_hours2, weight2, total_samples=50):
        """Initialise the memory-build scheduler.

        Args:
            n_hours1 (float): Mean of the first distribution (hours before now).
            std_hours1 (float): Standard deviation of the first distribution (hours).
            weight1 (float): Weight of the first distribution.
            n_hours2 (float): Mean of the second distribution (hours before now).
            std_hours2 (float): Standard deviation of the second distribution (hours).
            weight2 (float): Weight of the second distribution.
            total_samples (int): Total number of time points to generate.

        Raises:
            ValueError: If ``total_samples`` is not positive, a weight or a
                standard deviation is negative, or both weights are zero.
        """
        # Validate arguments
        if total_samples <= 0:
            raise ValueError("total_samples 必须大于0")
        if weight1 < 0 or weight2 < 0:
            raise ValueError("权重必须为非负数")
        if std_hours1 < 0 or std_hours2 < 0:
            raise ValueError("标准差必须为非负数")

        # Normalise the weights so they sum to 1
        total_weight = weight1 + weight2
        if total_weight == 0:
            raise ValueError("权重总和不能为0")
        self.weight1 = weight1 / total_weight
        self.weight2 = weight2 / total_weight

        self.n_hours1 = n_hours1
        self.std_hours1 = std_hours1
        self.n_hours2 = n_hours2
        self.std_hours2 = std_hours2
        self.total_samples = total_samples
        self.base_time = datetime.now()

    def _split_sample_counts(self):
        """Return ``(samples1, samples2)``, which always sum to ``total_samples``."""
        total = self.total_samples
        # A component with zero weight must not contribute any sample.
        if self.weight1 == 0:
            return 0, total
        if self.weight2 == 0:
            return total, 0
        # A single sample cannot be shared; give it to the heavier component.
        if total == 1:
            return (1, 0) if self.weight1 >= self.weight2 else (0, 1)
        # Both weights positive: keep at least one sample per component while
        # never exceeding the requested total.
        samples1 = min(max(1, int(total * self.weight1)), total - 1)
        return samples1, total - samples1

    def generate_time_samples(self):
        """Generate the time sample points of the mixture distribution.

        Returns:
            list[datetime]: ``total_samples`` datetimes, sorted from earliest to
            most recent, none later than ``base_time``.
        """
        # Number of samples per component, derived from the weights
        samples1, samples2 = self._split_sample_counts()

        # Draw the hour offsets of both normal distributions
        hours_offset1 = np.random.normal(loc=self.n_hours1, scale=self.std_hours1, size=samples1)
        hours_offset2 = np.random.normal(loc=self.n_hours2, scale=self.std_hours2, size=samples2)

        # Merge the offsets of both components
        hours_offset = np.concatenate([hours_offset1, hours_offset2])

        # Convert offsets to datetimes; abs() keeps every point in the past
        timestamps = [self.base_time - timedelta(hours=abs(offset)) for offset in hours_offset]

        # Sort chronologically (earliest first)
        return sorted(timestamps)

    def get_timestamp_array(self):
        """Return the sampled time points as integer Unix timestamps (seconds)."""
        timestamps = self.generate_time_samples()
        return [int(t.timestamp()) for t in timestamps]
|
||||
|
||||
|
||||
# def print_time_samples(timestamps, show_distribution=True):
|
||||
# """打印时间样本和分布信息"""
|
||||
# print(f"\n生成的{len(timestamps)}个时间点分布:")
|
||||
# print("序号".ljust(5), "时间戳".ljust(25), "距现在(小时)")
|
||||
# print("-" * 50)
|
||||
|
||||
# now = datetime.now()
|
||||
# time_diffs = []
|
||||
|
||||
# for i, timestamp in enumerate(timestamps, 1):
|
||||
# hours_diff = (now - timestamp).total_seconds() / 3600
|
||||
# time_diffs.append(hours_diff)
|
||||
# print(f"{str(i).ljust(5)} {timestamp.strftime('%Y-%m-%d %H:%M:%S').ljust(25)} {hours_diff:.2f}")
|
||||
|
||||
# # 打印统计信息
|
||||
# print("\n统计信息:")
|
||||
# print(f"平均时间偏移:{np.mean(time_diffs):.2f}小时")
|
||||
# print(f"标准差:{np.std(time_diffs):.2f}小时")
|
||||
# print(f"最早时间:{min(timestamps).strftime('%Y-%m-%d %H:%M:%S')} ({max(time_diffs):.2f}小时前)")
|
||||
# print(f"最近时间:{max(timestamps).strftime('%Y-%m-%d %H:%M:%S')} ({min(time_diffs):.2f}小时前)")
|
||||
|
||||
# if show_distribution:
|
||||
# # 计算时间分布的直方图
|
||||
# hist, bins = np.histogram(time_diffs, bins=40)
|
||||
# print("\n时间分布(每个*代表一个时间点):")
|
||||
# for i in range(len(hist)):
|
||||
# if hist[i] > 0:
|
||||
# print(f"{bins[i]:6.1f}-{bins[i + 1]:6.1f}小时: {'*' * int(hist[i])}")
|
||||
|
||||
|
||||
# # 使用示例
|
||||
# if __name__ == "__main__":
|
||||
# # 创建一个双峰分布的记忆调度器
|
||||
# scheduler = MemoryBuildScheduler(
|
||||
# n_hours1=12, # 第一个分布均值(12小时前)
|
||||
# std_hours1=8, # 第一个分布标准差
|
||||
# weight1=0.7, # 第一个分布权重 70%
|
||||
# n_hours2=36, # 第二个分布均值(36小时前)
|
||||
# std_hours2=24, # 第二个分布标准差
|
||||
# weight2=0.3, # 第二个分布权重 30%
|
||||
# total_samples=50, # 总共生成50个时间点
|
||||
# )
|
||||
|
||||
# # 生成时间分布
|
||||
# timestamps = scheduler.generate_time_samples()
|
||||
|
||||
# # 打印结果,包含分布可视化
|
||||
# print_time_samples(timestamps, show_distribution=True)
|
||||
|
||||
# # 打印时间戳数组
|
||||
# timestamp_array = scheduler.get_timestamp_array()
|
||||
# print("\n时间戳数组(Unix时间戳):")
|
||||
# print("[", end="")
|
||||
# for i, ts in enumerate(timestamp_array):
|
||||
# if i > 0:
|
||||
# print(", ", end="")
|
||||
# print(ts, end="")
|
||||
# print("]")
|
||||
10
src/chat/message_receive/__init__.py
Normal file
10
src/chat/message_receive/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from src.chat.emoji_system.emoji_manager import get_emoji_manager
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
from src.chat.message_receive.storage import MessageStorage
|
||||
|
||||
|
||||
__all__ = [
|
||||
"get_emoji_manager",
|
||||
"get_chat_manager",
|
||||
"MessageStorage",
|
||||
]
|
||||
288
src/chat/message_receive/bot.py
Normal file
288
src/chat/message_receive/bot.py
Normal file
@@ -0,0 +1,288 @@
|
||||
import traceback
|
||||
import os
|
||||
import re
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
from maim_message import UserInfo
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.config.config import global_config
|
||||
from src.mood.mood_manager import mood_manager # 导入情绪管理器
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager, ChatStream
|
||||
from src.chat.message_receive.message import MessageRecv, MessageRecvS4U
|
||||
from src.chat.message_receive.storage import MessageStorage
|
||||
from src.chat.heart_flow.heartflow_message_processor import HeartFCMessageReceiver
|
||||
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
||||
from src.plugin_system.core import component_registry, events_manager, global_announcement_manager
|
||||
from src.plugin_system.base import BaseCommand, EventType
|
||||
from src.mais4u.mais4u_chat.s4u_msg_processor import S4UMessageProcessor
|
||||
|
||||
# 定义日志配置
|
||||
|
||||
# 获取项目根目录(假设本文件在src/chat/message_receive/下,根目录为上上上级目录)
|
||||
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
|
||||
|
||||
# 配置主程序日志格式
|
||||
logger = get_logger("chat")
|
||||
|
||||
|
||||
def _check_ban_words(text: str, chat: ChatStream, userinfo: UserInfo) -> bool:
    """Report whether the message text contains a configured ban word.

    Args:
        text: Text to inspect.
        chat: Chat stream the message belongs to (used for the log line).
        userinfo: Sender information (used for the log line).

    Returns:
        bool: True when a ban word occurs in ``text``, False otherwise.
    """
    # First configured word that occurs in the text, if any
    hit = next((w for w in global_config.message_receive.ban_words if w in text), None)
    if hit is None:
        return False

    if chat.group_info:
        chat_name = chat.group_info.group_name
    else:
        chat_name = "私聊"
    logger.info(f"[{chat_name}]{userinfo.user_nickname}:{text}")
    logger.info(f"[过滤词识别]消息中含有{hit},filtered")
    return True
|
||||
|
||||
|
||||
def _check_ban_regex(text: str, chat: ChatStream, userinfo: UserInfo) -> bool:
    """Report whether the message matches a configured ban regular expression.

    Args:
        text: Text to inspect. The caller passes the message's raw text, which
            may be missing; ``None`` is treated as "no match".
        chat: Chat stream the message belongs to (used for the log line).
        userinfo: Sender information (used for the log line).

    Returns:
        bool: True when any configured pattern matches ``text``, False otherwise.
    """
    # re.search() raises TypeError on None; a message without raw text simply
    # cannot match a filter, so it must not abort message processing.
    if text is None:
        return False

    for pattern in global_config.message_receive.ban_msgs_regex:
        if re.search(pattern, text):
            chat_name = chat.group_info.group_name if chat.group_info else "私聊"
            logger.info(f"[{chat_name}]{userinfo.user_nickname}:{text}")
            logger.info(f"[正则表达式过滤]消息匹配到{pattern},filtered")
            return True
    return False
|
||||
|
||||
|
||||
class ChatBot:
    """Entry point that pre-processes incoming messages and dispatches them to a processor."""

    def __init__(self):
        self.bot = None  # reference to the bot instance
        self._started = False
        self.mood_manager = mood_manager  # module-level mood manager object
        self.heartflow_message_receiver = HeartFCMessageReceiver()  # newly added

        self.s4u_message_processor = S4UMessageProcessor()

    async def _ensure_started(self):
        """Mark the bot as started the first time it is called."""
        if not self._started:
            logger.debug("确保ChatBot所有任务已启动")

            self._started = True

    async def _process_commands_with_new_system(self, message: MessageRecv):
        # sourcery skip: use-named-expression
        """Look up and execute a plugin command matching the message text.

        Returns:
            tuple: ``(is_command, response, continue_process)``.
            ``(False, None, True)`` when no command matched, the command is
            disabled for this chat, or the lookup itself failed.
            ``(True, response, not intercept_message)`` after the command ran.
            ``(True, str(error), False)`` when executing the command raised.
        """
        try:
            text = message.processed_plain_text

            # Look the command up through the component registry
            command_result = component_registry.find_command_by_text(text)
            if command_result:
                command_class, matched_groups, command_info = command_result
                plugin_name = command_info.plugin_name
                command_name = command_info.name
                if (
                    message.chat_stream
                    and message.chat_stream.stream_id
                    and command_name
                    in global_announcement_manager.get_disabled_chat_commands(message.chat_stream.stream_id)
                ):
                    logger.info("用户禁用的命令,跳过处理")
                    return False, None, True

                message.is_command = True

                # Fetch the plugin configuration
                plugin_config = component_registry.get_plugin_config(plugin_name)

                # Create the command instance
                command_instance: BaseCommand = command_class(message, plugin_config)
                command_instance.set_matched_groups(matched_groups)

                try:
                    # Execute the command
                    success, response, intercept_message = await command_instance.execute()

                    # Log the outcome of the command
                    if success:
                        logger.info(f"命令执行成功: {command_class.__name__} (拦截: {intercept_message})")
                    else:
                        logger.warning(f"命令执行失败: {command_class.__name__} - {response}")

                    # The command's intercept flag decides whether the message is processed further
                    return True, response, not intercept_message  # command found; continue only if not intercepted

                except Exception as e:
                    logger.error(f"执行命令时出错: {command_class.__name__} - {e}")
                    logger.error(traceback.format_exc())

                    try:
                        await command_instance.send_text(f"命令执行出错: {str(e)}")
                    except Exception as send_error:
                        logger.error(f"发送错误消息失败: {send_error}")

                    # NOTE(review): the original comment said processing continues after an
                    # error, but the third value is False, so the caller stops processing
                    # this message. Confirm which behaviour is intended.
                    return True, str(e), False

            # No command found: keep processing the message
            return False, None, True

        except Exception as e:
            logger.error(f"处理命令时出错: {e}")
            return False, None, True  # on lookup failure keep processing the message

    async def hanle_notice_message(self, message: MessageRecv):
        """Flag messages whose ``message_id`` is ``"notice"`` as notifications."""
        if message.message_info.message_id == "notice":
            message.is_notify = True
            logger.info("notice消息")
            # print(message)

            # NOTE(review): nesting of this return was reconstructed; the only
            # caller in this class ignores the result.
            return True

    async def do_s4u(self, message_data: Dict[str, Any]):
        """Build an S4U message, bind it to its chat stream and pass it to the S4U processor."""
        message = MessageRecvS4U(message_data)
        group_info = message.message_info.group_info
        user_info = message.message_info.user_info

        get_chat_manager().register_message(message)
        chat = await get_chat_manager().get_or_create_stream(
            platform=message.message_info.platform,  # type: ignore
            user_info=user_info,  # type: ignore
            group_info=group_info,
        )

        message.update_chat_stream(chat)

        # Process the message content
        await message.process()

        await self.s4u_message_processor.process_message(message)

        return

    async def message_process(self, message_data: Dict[str, Any]) -> None:
        """Pre-process a message in the unified format and dispatch it.

        Steps, in order: route ``amaidesu_default`` platform messages to
        ``do_s4u``; coerce group/user ids to ``str``; build a ``MessageRecv``;
        short-circuit echoed self-messages; register the message and resolve its
        chat stream; render plain text; apply ban-word and ban-regex filters;
        run plugin commands; fire the ``ON_MESSAGE`` event; register any custom
        prompt templates; finally hand the message to the heart-flow receiver.

        Any exception raised along the way is logged and swallowed.
        """
        try:
            # Make sure start-up has been flagged
            await self._ensure_started()

            platform = message_data["message_info"].get("platform")

            if platform == "amaidesu_default":
                await self.do_s4u(message_data)
                return

            if message_data["message_info"].get("group_info") is not None:
                message_data["message_info"]["group_info"]["group_id"] = str(
                    message_data["message_info"]["group_info"]["group_id"]
                )
            if message_data["message_info"].get("user_info") is not None:
                message_data["message_info"]["user_info"]["user_id"] = str(
                    message_data["message_info"]["user_info"]["user_id"]
                )
            # print(message_data)
            # logger.debug(str(message_data))
            message = MessageRecv(message_data)

            if await self.hanle_notice_message(message):
                # return
                pass

            group_info = message.message_info.group_info
            user_info = message.message_info.user_info
            if message.message_info.additional_config:
                sent_message = message.message_info.additional_config.get("echo", False)
                if sent_message:  # Intercepts the bot's own reported (echoed) message before anything else, only to update its message_id; requires the adapter to report such events
                    await MessageStorage.update_message(message)
                    return

            get_chat_manager().register_message(message)

            chat = await get_chat_manager().get_or_create_stream(
                platform=message.message_info.platform,  # type: ignore
                user_info=user_info,  # type: ignore
                group_info=group_info,
            )

            message.update_chat_stream(chat)

            # Process the message content and produce plain text
            await message.process()

            # if await self.check_ban_content(message):
            #     logger.warning(f"检测到消息中含有违法,色情,暴力,反动,敏感内容,消息内容:{message.processed_plain_text},发送者:{message.message_info.user_info.user_nickname}")
            #     return

            # Filter checks: ban words on the processed text, ban regexes on the raw message
            if _check_ban_words(message.processed_plain_text, chat, user_info) or _check_ban_regex(  # type: ignore
                message.raw_message,  # type: ignore
                chat,
                user_info,  # type: ignore
            ):
                return

            # Command handling: check for and run a command through the plugin system
            is_command, cmd_result, continue_process = await self._process_commands_with_new_system(message)

            # A command that does not want further processing: store the message and stop
            if is_command and not continue_process:
                await MessageStorage.store_message(message, chat)
                logger.info(f"命令处理完成,跳过后续消息处理: {cmd_result}")
                return

            if not await events_manager.handle_mai_events(EventType.ON_MESSAGE, message):
                return

            # Check whether the incoming message carries custom prompt template information
            if message.message_info.template_info and not message.message_info.template_info.template_default:
                template_group_name: Optional[str] = message.message_info.template_info.template_name  # type: ignore
                template_items = message.message_info.template_info.template_items
                async with global_prompt_manager.async_message_scope(template_group_name):
                    if isinstance(template_items, dict):
                        for k in template_items.keys():
                            await Prompt.create_async(template_items[k], k)
                            logger.debug(f"注册{template_items[k]},{k}")
            else:
                template_group_name = None

            async def preprocess():
                await self.heartflow_message_receiver.process_message(message)

            # Run the receiver inside the template scope when a custom template group is in use
            if template_group_name:
                async with global_prompt_manager.async_message_scope(template_group_name):
                    await preprocess()
            else:
                await preprocess()

        except Exception as e:
            logger.error(f"预处理消息失败: {e}")
            traceback.print_exc()
|
||||
|
||||
|
||||
# 创建全局ChatBot实例
|
||||
chat_bot = ChatBot()
|
||||
436
src/chat/message_receive/chat_stream.py
Normal file
436
src/chat/message_receive/chat_stream.py
Normal file
@@ -0,0 +1,436 @@
|
||||
import asyncio
|
||||
import hashlib
|
||||
import time
|
||||
import copy
|
||||
from typing import Dict, Optional, TYPE_CHECKING
|
||||
from rich.traceback import install
|
||||
from maim_message import GroupInfo, UserInfo
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.common.database.database import db
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
|
||||
from sqlalchemy.dialects.mysql import insert as mysql_insert
|
||||
from src.common.database.sqlalchemy_models import ChatStreams # 新增导入
|
||||
from src.common.database.sqlalchemy_database_api import get_session
|
||||
from src.config.config import global_config # 新增导入
|
||||
# 避免循环导入,使用TYPE_CHECKING进行类型提示
|
||||
if TYPE_CHECKING:
|
||||
from .message import MessageRecv
|
||||
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
|
||||
logger = get_logger("chat_stream")
|
||||
session = get_session()
|
||||
|
||||
class ChatMessageContext:
    """Context of a chat message: wraps one message and exposes facts about it."""

    def __init__(self, message: "MessageRecv"):
        self.message = message

    def get_template_name(self) -> Optional[str]:
        """Return the custom template name, or None when absent or the default is used."""
        template_info = self.message.message_info.template_info
        if not template_info or template_info.template_default:
            return None
        return template_info.template_name  # type: ignore

    def get_last_message(self) -> "MessageRecv":
        """Return the wrapped (last) message."""
        return self.message

    def check_types(self, types: list) -> bool:
        """Return True when every entry of ``types`` is in the accepted formats.

        An empty or missing ``accept_format`` always yields False.
        """
        accepted = self.message.message_info.format_info.accept_format  # type: ignore
        return bool(accepted) and all(t in accepted for t in types)

    def get_priority_mode(self) -> str:
        """Return the priority mode of the message."""
        return self.message.priority_mode

    def get_priority_info(self) -> Optional[dict]:
        """Return the message's priority info, or None when missing or empty."""
        info = getattr(self.message, "priority_info", None)
        return info if info else None
|
||||
|
||||
|
||||
class ChatStream:
    """A chat stream: identity, participants and activity times of one conversation."""

    def __init__(
        self,
        stream_id: str,
        platform: str,
        user_info: UserInfo,
        group_info: Optional[GroupInfo] = None,
        data: Optional[dict] = None,
    ):
        # Stored times, if any, come from ``data``; otherwise "now" is used.
        stored = data or {}

        self.stream_id = stream_id
        self.platform = platform
        self.user_info = user_info
        self.group_info = group_info
        self.create_time = stored.get("create_time", time.time())
        self.last_active_time = stored.get("last_active_time", self.create_time)
        self.saved = False
        self.context: ChatMessageContext = None  # type: ignore # context of the latest message in this chat

    def to_dict(self) -> dict:
        """Serialise the stream into a plain dictionary."""
        user_part = self.user_info.to_dict() if self.user_info else None
        group_part = self.group_info.to_dict() if self.group_info else None
        return {
            "stream_id": self.stream_id,
            "platform": self.platform,
            "user_info": user_part,
            "group_info": group_part,
            "create_time": self.create_time,
            "last_active_time": self.last_active_time,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ChatStream":
        """Build a stream from a dictionary such as the one produced by ``to_dict``."""
        raw_user = data.get("user_info")
        raw_group = data.get("group_info")
        user_info = UserInfo.from_dict(raw_user) if raw_user else None
        group_info = GroupInfo.from_dict(raw_group) if raw_group else None

        return cls(
            stream_id=data["stream_id"],
            platform=data["platform"],
            user_info=user_info,  # type: ignore
            group_info=group_info,
            data=data,
        )

    def update_active_time(self):
        """Set the last-active time to now and mark the stream as unsaved."""
        self.last_active_time = time.time()
        self.saved = False

    def set_context(self, message: "MessageRecv"):
        """Attach a fresh message context built from ``message``."""
        self.context = ChatMessageContext(message)
|
||||
|
||||
|
||||
class ChatManager:
    """Chat manager: owns every chat stream, cached in memory and persisted to the database."""

    # One shared instance is handed out by __new__; _initialized guards __init__.
    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if not self._initialized:
            self.streams: Dict[str, ChatStream] = {}  # stream_id -> ChatStream
            self.last_messages: Dict[str, "MessageRecv"] = {}  # stream_id -> last_message
            try:
                db.connect(reuse_if_open=True)
                # Make sure the chat_streams table exists
                session.execute(text("CREATE TABLE IF NOT EXISTS chat_streams (stream_id TEXT PRIMARY KEY, platform TEXT, create_time REAL, last_active_time REAL, user_platform TEXT, user_id TEXT, user_nickname TEXT, user_cardname TEXT, group_platform TEXT, group_id TEXT, group_name TEXT)"))
                session.commit()
            except Exception as e:
                logger.error(f"数据库连接或 ChatStreams 表创建失败: {e}")

            self._initialized = True
            # Kick off initialisation in the event loop (currently disabled)
            # asyncio.create_task(self._initialize())
            # # Start the auto-save task
            # asyncio.create_task(self._auto_save_task())

    async def _initialize(self):
        """Asynchronous initialisation: load all streams and log the count."""
        try:
            await self.load_all_streams()
            logger.info(f"聊天管理器已启动,已加载 {len(self.streams)} 个聊天流")
        except Exception as e:
            logger.error(f"聊天管理器启动失败: {str(e)}")

    async def _auto_save_task(self):
        """Periodically save every chat stream; loops forever."""
        while True:
            await asyncio.sleep(300)  # save once every 5 minutes
            try:
                await self._save_all_streams()
                logger.info("聊天流自动保存完成")
            except Exception as e:
                logger.error(f"聊天流自动保存失败: {str(e)}")

    def register_message(self, message: "MessageRecv"):
        """Record ``message`` as the last message of its chat stream."""
        stream_id = self._generate_stream_id(
            message.message_info.platform,  # type: ignore
            message.message_info.user_info,
            message.message_info.group_info,
        )
        self.last_messages[stream_id] = message
        # logger.debug(f"注册消息到聊天流: {stream_id}")

    @staticmethod
    def _generate_stream_id(
        platform: str, user_info: Optional[UserInfo], group_info: Optional[GroupInfo] = None
    ) -> str:
        """Generate the unique id of a chat stream.

        Group chats are keyed by ``platform`` and group id; private chats by
        ``platform``, user id and the literal ``"private"``.

        Raises:
            ValueError: If neither user nor group information is given.
        """
        if not user_info and not group_info:
            raise ValueError("用户信息或群组信息必须提供")

        if group_info:
            # Combine the identifying parts
            components = [platform, str(group_info.group_id)]
        else:
            components = [platform, str(user_info.user_id), "private"]  # type: ignore

        # The id is the MD5 hex digest of the joined parts
        key = "_".join(components)
        return hashlib.md5(key.encode()).hexdigest()

    def get_stream_id(self, platform: str, id: str, is_group: bool = True) -> str:
        """Compute a stream id from a raw group id (``is_group``) or user id.

        Uses the same key layout as ``_generate_stream_id``.
        """
        components = [platform, id] if is_group else [platform, id, "private"]
        key = "_".join(components)
        return hashlib.md5(key.encode()).hexdigest()

    async def get_or_create_stream(
        self, platform: str, user_info: UserInfo, group_info: Optional[GroupInfo] = None
    ) -> ChatStream:
        """Get or create a chat stream.

        Lookup order: in-memory cache, then the database, otherwise a new
        stream is created. Streams that were not cached are stored in the cache
        and saved to the database before being returned.

        Args:
            platform: Platform identifier.
            user_info: User information.
            group_info: Group information (optional).

        Returns:
            ChatStream: The chat stream object.
        """
        # Generate the stream_id
        try:
            stream_id = self._generate_stream_id(platform, user_info, group_info)

            # Check the in-memory cache first
            if stream_id in self.streams:
                stream = self.streams[stream_id]

                # The active time is updated on the cached object; user/group info only on the copy
                stream.update_active_time()
                stream = copy.deepcopy(stream)  # return a copy so outside changes do not affect the cache
                if user_info.platform and user_info.user_id:
                    stream.user_info = user_info
                if group_info:
                    stream.group_info = group_info
                from .message import MessageRecv  # deferred import to avoid a circular import

                if stream_id in self.last_messages and isinstance(self.last_messages[stream_id], MessageRecv):
                    stream.set_context(self.last_messages[stream_id])
                else:
                    logger.error(f"聊天流 {stream_id} 不在最后消息列表中,可能是新创建的")
                return stream

            # Check whether the stream exists in the database
            def _db_find_stream_sync(s_id: str):
                return session.execute(select(ChatStreams).where(ChatStreams.stream_id == s_id)).scalar()

            model_instance = await asyncio.to_thread(_db_find_stream_sync, stream_id)

            if model_instance:
                # Convert the database row into the dict layout ChatStream.from_dict expects
                # (the original comment mentioned Peewee; the query above is a SQLAlchemy select)
                user_info_data = {
                    "platform": model_instance.user_platform,
                    "user_id": model_instance.user_id,
                    "user_nickname": model_instance.user_nickname,
                    "user_cardname": model_instance.user_cardname or "",
                }
                group_info_data = None
                if model_instance.group_id:  # an empty group_id is taken to mean "no group info"
                    group_info_data = {
                        "platform": model_instance.group_platform,
                        "group_id": model_instance.group_id,
                        "group_name": model_instance.group_name,
                    }

                data_for_from_dict = {
                    "stream_id": model_instance.stream_id,
                    "platform": model_instance.platform,
                    "user_info": user_info_data,
                    "group_info": group_info_data,
                    "create_time": model_instance.create_time,
                    "last_active_time": model_instance.last_active_time,
                }
                stream = ChatStream.from_dict(data_for_from_dict)
                # Overwrite stored user/group info with the values passed in
                stream.user_info = user_info
                if group_info:
                    stream.group_info = group_info
                stream.update_active_time()
            else:
                # Create a new chat stream
                stream = ChatStream(
                    stream_id=stream_id,
                    platform=platform,
                    user_info=user_info,
                    group_info=group_info,
                )
        except Exception as e:
            logger.error(f"获取或创建聊天流失败: {e}", exc_info=True)
            raise e

        stream = copy.deepcopy(stream)
        from .message import MessageRecv  # deferred import to avoid a circular import

        if stream_id in self.last_messages and isinstance(self.last_messages[stream_id], MessageRecv):
            stream.set_context(self.last_messages[stream_id])
        else:
            logger.error(f"聊天流 {stream_id} 不在最后消息列表中,可能是新创建的")
        # Store in memory and persist to the database
        self.streams[stream_id] = stream
        await self._save_stream(stream)
        return stream

    def get_stream(self, stream_id: str) -> Optional[ChatStream]:
        """Return the cached stream for ``stream_id`` (or None), refreshing its context."""
        stream = self.streams.get(stream_id)
        if not stream:
            return None
        if stream_id in self.last_messages:
            stream.set_context(self.last_messages[stream_id])
        return stream

    def get_stream_by_info(
        self, platform: str, user_info: UserInfo, group_info: Optional[GroupInfo] = None
    ) -> Optional[ChatStream]:
        """Return the cached stream identified by platform/user/group info, or None."""
        stream_id = self._generate_stream_id(platform, user_info, group_info)
        return self.streams.get(stream_id)

    def get_stream_name(self, stream_id: str) -> Optional[str]:
        """Return a display name for the stream.

        The group name is preferred; otherwise the user's nickname followed by
        a private-chat suffix; None when the stream or both names are missing.
        """
        stream = self.get_stream(stream_id)
        if not stream:
            return None

        if stream.group_info and stream.group_info.group_name:
            return stream.group_info.group_name
        elif stream.user_info and stream.user_info.user_nickname:
            return f"{stream.user_info.user_nickname}的私聊"
        else:
            return None

    @staticmethod
    async def _save_stream(stream: ChatStream):
        """Upsert the chat stream into the database unless it is already saved.

        Failures are logged and swallowed; ``stream.saved`` is set only on success.
        """
        if stream.saved:
            return
        stream_data_dict = stream.to_dict()

        def _db_save_stream_sync(s_data_dict: dict):
            user_info_d = s_data_dict.get("user_info")
            group_info_d = s_data_dict.get("group_info")

            fields_to_save = {
                "platform": s_data_dict["platform"],
                "create_time": s_data_dict["create_time"],
                "last_active_time": s_data_dict["last_active_time"],
                "user_platform": user_info_d["platform"] if user_info_d else "",
                "user_id": user_info_d["user_id"] if user_info_d else "",
                "user_nickname": user_info_d["user_nickname"] if user_info_d else "",
                "user_cardname": user_info_d.get("user_cardname", "") if user_info_d else None,
                "group_platform": group_info_d["platform"] if group_info_d else "",
                "group_id": group_info_d["group_id"] if group_info_d else "",
                "group_name": group_info_d["group_name"] if group_info_d else "",
            }

            # Pick the upsert statement that matches the configured database type
            if global_config.database.database_type == "sqlite":
                stmt = sqlite_insert(ChatStreams).values(stream_id=s_data_dict["stream_id"], **fields_to_save)
                stmt = stmt.on_conflict_do_update(
                    index_elements=['stream_id'],
                    set_=fields_to_save
                )
            elif global_config.database.database_type == "mysql":
                stmt = mysql_insert(ChatStreams).values(stream_id=s_data_dict["stream_id"], **fields_to_save)
                # fields_to_save has no "stream_id" key, so this filter currently removes nothing
                stmt = stmt.on_duplicate_key_update(
                    **{key: value for key, value in fields_to_save.items() if key != "stream_id"}
                )
            else:
                # Any other database type falls back to the SQLite upsert syntax
                stmt = sqlite_insert(ChatStreams).values(stream_id=s_data_dict["stream_id"], **fields_to_save)
                stmt = stmt.on_conflict_do_update(
                    index_elements=['stream_id'],
                    set_=fields_to_save
                )

            session.execute(stmt)
            session.commit()

        try:
            # The blocking database call runs in a worker thread
            await asyncio.to_thread(_db_save_stream_sync, stream_data_dict)
            stream.saved = True
        except Exception as e:
            logger.error(f"保存聊天流 {stream.stream_id} 到数据库失败 (Peewee): {e}", exc_info=True)

    async def _save_all_streams(self):
        """Save every cached chat stream, one after another."""
        for stream in self.streams.values():
            await self._save_stream(stream)

    async def load_all_streams(self):
        """Replace the in-memory cache with all chat streams stored in the database.

        Failures are logged and swallowed.
        """
        logger.info("正在从数据库加载所有聊天流")

        def _db_load_all_streams_sync():
            loaded_streams_data = []
            for model_instance in session.execute(select(ChatStreams)).scalars():
                user_info_data = {
                    "platform": model_instance.user_platform,
                    "user_id": model_instance.user_id,
                    "user_nickname": model_instance.user_nickname,
                    "user_cardname": model_instance.user_cardname or "",
                }
                group_info_data = None
                if model_instance.group_id:
                    group_info_data = {
                        "platform": model_instance.group_platform,
                        "group_id": model_instance.group_id,
                        "group_name": model_instance.group_name,
                    }

                data_for_from_dict = {
                    "stream_id": model_instance.stream_id,
                    "platform": model_instance.platform,
                    "user_info": user_info_data,
                    "group_info": group_info_data,
                    "create_time": model_instance.create_time,
                    "last_active_time": model_instance.last_active_time,
                }
                loaded_streams_data.append(data_for_from_dict)
            return loaded_streams_data

        try:
            all_streams_data_list = await asyncio.to_thread(_db_load_all_streams_sync)
            self.streams.clear()
            for data in all_streams_data_list:
                stream = ChatStream.from_dict(data)
                stream.saved = True  # freshly loaded rows are already persisted
                self.streams[stream.stream_id] = stream
                if stream.stream_id in self.last_messages:
                    stream.set_context(self.last_messages[stream.stream_id])
        except Exception as e:
            logger.error(f"从数据库加载所有聊天流失败 (Peewee): {e}", exc_info=True)
|
||||
|
||||
|
||||
chat_manager = None
|
||||
|
||||
|
||||
def get_chat_manager():
    """Return the module-wide ChatManager, creating it on first use."""
    global chat_manager
    if chat_manager is not None:
        return chat_manager
    chat_manager = ChatManager()
    return chat_manager
|
||||
572
src/chat/message_receive/message.py
Normal file
572
src/chat/message_receive/message.py
Normal file
@@ -0,0 +1,572 @@
|
||||
import time
|
||||
import urllib3
|
||||
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from rich.traceback import install
|
||||
from typing import Optional, Any
|
||||
from maim_message import Seg, UserInfo, BaseMessageInfo, MessageBase
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.utils.utils_image import get_image_manager
|
||||
from src.chat.utils.utils_voice import get_voice_text
|
||||
from .chat_stream import ChatStream
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
logger = get_logger("chat_message")
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
# 这个类是消息数据类,用于存储和管理消息数据。
|
||||
# 它定义了消息的属性,包括群组ID、用户ID、消息ID、原始消息内容、纯文本内容和时间戳。
|
||||
# 它还定义了两个辅助属性:keywords用于提取消息的关键词,is_plain_text用于判断消息是否为纯文本。
|
||||
|
||||
|
||||
@dataclass
class Message(MessageBase):
    """Base chat message bound to a chat stream, with optional reply and rendered text."""

    chat_stream: "ChatStream" = None  # type: ignore
    reply: Optional["Message"] = None
    processed_plain_text: str = ""
    memorized_times: int = 0

    def __init__(
        self,
        message_id: str,
        chat_stream: "ChatStream",
        user_info: UserInfo,
        message_segment: Optional[Seg] = None,
        timestamp: Optional[float] = None,
        reply: Optional["MessageRecv"] = None,
        processed_plain_text: str = "",
    ):
        # Use the supplied timestamp, otherwise the current time (millisecond precision)
        if timestamp is not None:
            message_time = timestamp
        else:
            message_time = round(time.time(), 3)

        # Platform and group come from the chat stream; the sender is given explicitly
        base_info = BaseMessageInfo(
            platform=chat_stream.platform,
            message_id=message_id,
            time=message_time,
            group_info=chat_stream.group_info,
            user_info=user_info,
        )

        # Initialise the parent message
        super().__init__(message_info=base_info, message_segment=message_segment, raw_message=None)  # type: ignore

        self.chat_stream = chat_stream
        # Rendered plain text of the message
        self.processed_plain_text = processed_plain_text

        # Message this one replies to
        self.reply = reply

    async def _process_message_segments(self, segment: Seg) -> str:
        # sourcery skip: remove-unnecessary-else, swap-if-else-branches
        """Recursively render a message segment as text.

        Args:
            segment: Segment to render.

        Returns:
            str: For a ``seglist``, the non-empty renderings of its children
            joined by single spaces; otherwise the rendering of the single segment.
        """
        if segment.type == "seglist":
            # Render each child in order and drop the ones that come back empty
            rendered_parts = []
            for child in segment.data:
                rendered = await self._process_message_segments(child)  # type: ignore
                if rendered:
                    rendered_parts.append(rendered)
            return " ".join(rendered_parts)
        else:
            # A single segment is rendered by the subclass hook
            return await self._process_single_segment(segment)  # type: ignore

    @abstractmethod
    async def _process_single_segment(self, segment):
        pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class MessageRecv(Message):
|
||||
"""接收消息类,用于处理从MessageCQ序列化的消息"""
|
||||
|
||||
def __init__(self, message_dict: dict[str, Any]):
|
||||
"""从MessageCQ的字典初始化
|
||||
|
||||
Args:
|
||||
message_dict: MessageCQ序列化后的字典
|
||||
"""
|
||||
self.message_info = BaseMessageInfo.from_dict(message_dict.get("message_info", {}))
|
||||
self.message_segment = Seg.from_dict(message_dict.get("message_segment", {}))
|
||||
self.raw_message = message_dict.get("raw_message")
|
||||
self.processed_plain_text = message_dict.get("processed_plain_text", "")
|
||||
self.is_emoji = False
|
||||
self.has_emoji = False
|
||||
self.is_picid = False
|
||||
self.has_picid = False
|
||||
self.is_voice = False
|
||||
self.is_mentioned = None
|
||||
self.is_notify = False
|
||||
|
||||
self.is_command = False
|
||||
|
||||
self.priority_mode = "interest"
|
||||
self.priority_info = None
|
||||
self.interest_value: float = None # type: ignore
|
||||
|
||||
def update_chat_stream(self, chat_stream: "ChatStream"):
|
||||
self.chat_stream = chat_stream
|
||||
|
||||
async def process(self) -> None:
|
||||
"""处理消息内容,生成纯文本和详细文本
|
||||
|
||||
这个方法必须在创建实例后显式调用,因为它包含异步操作。
|
||||
"""
|
||||
self.processed_plain_text = await self._process_message_segments(self.message_segment)
|
||||
|
||||
async def _process_single_segment(self, segment: Seg) -> str:
|
||||
"""处理单个消息段
|
||||
|
||||
Args:
|
||||
segment: 消息段
|
||||
|
||||
Returns:
|
||||
str: 处理后的文本
|
||||
"""
|
||||
try:
|
||||
if segment.type == "text":
|
||||
self.is_picid = False
|
||||
self.is_emoji = False
|
||||
return segment.data # type: ignore
|
||||
elif segment.type == "image":
|
||||
# 如果是base64图片数据
|
||||
if isinstance(segment.data, str):
|
||||
self.has_picid = True
|
||||
self.is_picid = True
|
||||
self.is_emoji = False
|
||||
image_manager = get_image_manager()
|
||||
# print(f"segment.data: {segment.data}")
|
||||
_, processed_text = await image_manager.process_image(segment.data)
|
||||
return processed_text
|
||||
return "[发了一张图片,网卡了加载不出来]"
|
||||
elif segment.type == "emoji":
|
||||
self.has_emoji = True
|
||||
self.is_emoji = True
|
||||
self.is_picid = False
|
||||
self.is_voice = False
|
||||
if isinstance(segment.data, str):
|
||||
return await get_image_manager().get_emoji_description(segment.data)
|
||||
return "[发了一个表情包,网卡了加载不出来]"
|
||||
elif segment.type == "voice":
|
||||
self.is_picid = False
|
||||
self.is_emoji = False
|
||||
self.is_voice = True
|
||||
if isinstance(segment.data, str):
|
||||
return await get_voice_text(segment.data)
|
||||
return "[发了一段语音,网卡了加载不出来]"
|
||||
elif segment.type == "mention_bot":
|
||||
self.is_picid = False
|
||||
self.is_emoji = False
|
||||
self.is_voice = False
|
||||
self.is_mentioned = float(segment.data) # type: ignore
|
||||
return ""
|
||||
elif segment.type == "priority_info":
|
||||
self.is_picid = False
|
||||
self.is_emoji = False
|
||||
self.is_voice = False
|
||||
if isinstance(segment.data, dict):
|
||||
# 处理优先级信息
|
||||
self.priority_mode = "priority"
|
||||
self.priority_info = segment.data
|
||||
"""
|
||||
{
|
||||
'message_type': 'vip', # vip or normal
|
||||
'message_priority': 1.0, # 优先级,大为优先,float
|
||||
}
|
||||
"""
|
||||
return ""
|
||||
else:
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error(f"处理消息段失败: {str(e)}, 类型: {segment.type}, 数据: {segment.data}")
|
||||
return f"[处理失败的{segment.type}消息]"
|
||||
|
||||
|
||||
@dataclass
class MessageRecvS4U(MessageRecv):
    """Received message with extra S4U segment types (gift, superchat, screen, voice_done)."""

    def __init__(self, message_dict: dict[str, Any]):
        """Initialise from a serialized message dictionary.

        Args:
            message_dict: Dictionary accepted by ``MessageRecv.__init__``.
        """
        super().__init__(message_dict)
        self.is_gift = False
        self.is_fake_gift = False
        self.is_superchat = False
        self.gift_info = None
        self.gift_name = None
        # Stored as an int parsed from the "name:count" gift payload.
        self.gift_count: Optional[int] = None
        self.superchat_info = None
        self.superchat_price = None
        self.superchat_message_text = None
        self.is_screen = False
        # Initialised here so the attribute exists before any "screen" segment is seen.
        self.screen_info = None
        self.is_internal = False
        self.voice_done = None

        self.chat_info = None

    async def process(self) -> None:
        """Render the message segments into ``processed_plain_text``."""
        self.processed_plain_text = await self._process_message_segments(self.message_segment)

    async def _process_single_segment(self, segment: Seg) -> str:
        """Render one segment as text and update the content flags.

        Args:
            segment: The segment to render.

        Returns:
            str: The rendered text; a placeholder when the payload is not a
            string or when processing raised an exception.
        """
        try:
            if segment.type == "text":
                self.is_voice = False
                self.is_picid = False
                self.is_emoji = False
                return segment.data  # type: ignore
            elif segment.type == "image":
                self.is_voice = False
                # String payload: hand it to the image manager.
                if isinstance(segment.data, str):
                    self.has_picid = True
                    self.is_picid = True
                    self.is_emoji = False
                    image_manager = get_image_manager()
                    _, processed_text = await image_manager.process_image(segment.data)
                    return processed_text
                return "[发了一张图片,网卡了加载不出来]"
            elif segment.type == "emoji":
                self.has_emoji = True
                self.is_emoji = True
                self.is_picid = False
                if isinstance(segment.data, str):
                    return await get_image_manager().get_emoji_description(segment.data)
                return "[发了一个表情包,网卡了加载不出来]"
            elif segment.type == "voice":
                self.has_picid = False
                self.is_picid = False
                self.is_emoji = False
                self.is_voice = True
                if isinstance(segment.data, str):
                    return await get_voice_text(segment.data)
                return "[发了一段语音,网卡了加载不出来]"
            elif segment.type == "mention_bot":
                self.is_voice = False
                self.is_picid = False
                self.is_emoji = False
                self.is_mentioned = float(segment.data)  # type: ignore
                return ""
            elif segment.type == "priority_info":
                self.is_voice = False
                self.is_picid = False
                self.is_emoji = False
                if isinstance(segment.data, dict):
                    # Expected payload shape:
                    # {
                    #     'message_type': 'vip',      # vip or normal
                    #     'message_priority': 1.0,    # float, larger means higher priority
                    # }
                    self.priority_mode = "priority"
                    self.priority_info = segment.data
                return ""
            elif segment.type == "gift":
                self.is_voice = False
                # Payload format is "name:count". Parse into locals first so a
                # malformed payload does not leave is_gift set without gift data.
                name, count = segment.data.split(":", 1)  # type: ignore
                parsed_count = int(count.strip())
                self.is_gift = True
                self.gift_info = segment.data
                self.gift_name = name.strip()
                self.gift_count = parsed_count
                return ""
            elif segment.type == "voice_done":
                msg_id = segment.data
                logger.info(f"voice_done: {msg_id}")
                self.voice_done = msg_id
                return ""
            elif segment.type == "superchat":
                # Payload format is "price:text". Parse before setting any state
                # so a malformed payload does not mark the message as a superchat.
                price, message_text = segment.data.split(":", 1)  # type: ignore
                self.is_superchat = True
                self.superchat_info = segment.data
                self.superchat_price = price.strip()
                self.superchat_message_text = message_text.strip()

                self.processed_plain_text = str(self.superchat_message_text)
                self.processed_plain_text += (
                    f"(注意:这是一条超级弹幕信息,价值{self.superchat_price}元,请你认真回复)"
                )

                return self.processed_plain_text
            elif segment.type == "screen":
                self.is_screen = True
                self.screen_info = segment.data
                return "屏幕信息"
            else:
                return ""
        except Exception as e:
            logger.error(f"处理消息段失败: {str(e)}, 类型: {segment.type}, 数据: {segment.data}")
            return f"[处理失败的{segment.type}消息]"
|
||||
|
||||
|
||||
@dataclass
|
||||
class MessageProcessBase(Message):
|
||||
"""消息处理基类,用于处理中和发送中的消息"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
message_id: str,
|
||||
chat_stream: "ChatStream",
|
||||
bot_user_info: UserInfo,
|
||||
message_segment: Optional[Seg] = None,
|
||||
reply: Optional["MessageRecv"] = None,
|
||||
thinking_start_time: float = 0,
|
||||
timestamp: Optional[float] = None,
|
||||
):
|
||||
# 调用父类初始化,传递时间戳
|
||||
super().__init__(
|
||||
message_id=message_id,
|
||||
timestamp=timestamp,
|
||||
chat_stream=chat_stream,
|
||||
user_info=bot_user_info,
|
||||
message_segment=message_segment,
|
||||
reply=reply,
|
||||
)
|
||||
|
||||
# 处理状态相关属性
|
||||
self.thinking_start_time = thinking_start_time
|
||||
self.thinking_time = 0
|
||||
|
||||
def update_thinking_time(self) -> float:
|
||||
"""更新思考时间"""
|
||||
self.thinking_time = round(time.time() - self.thinking_start_time, 2)
|
||||
return self.thinking_time
|
||||
|
||||
async def _process_single_segment(self, seg: Seg) -> str | None:
|
||||
"""处理单个消息段
|
||||
|
||||
Args:
|
||||
seg: 要处理的消息段
|
||||
|
||||
Returns:
|
||||
str: 处理后的文本
|
||||
"""
|
||||
try:
|
||||
if seg.type == "text":
|
||||
return seg.data # type: ignore
|
||||
elif seg.type == "image":
|
||||
# 如果是base64图片数据
|
||||
if isinstance(seg.data, str):
|
||||
return await get_image_manager().get_image_description(seg.data)
|
||||
return "[图片,网卡了加载不出来]"
|
||||
elif seg.type == "emoji":
|
||||
if isinstance(seg.data, str):
|
||||
return await get_image_manager().get_emoji_tag(seg.data)
|
||||
return "[表情,网卡了加载不出来]"
|
||||
elif seg.type == "voice":
|
||||
if isinstance(seg.data, str):
|
||||
return await get_voice_text(seg.data)
|
||||
return "[发了一段语音,网卡了加载不出来]"
|
||||
elif seg.type == "at":
|
||||
return f"[@{seg.data}]"
|
||||
elif seg.type == "reply":
|
||||
if self.reply and hasattr(self.reply, "processed_plain_text"):
|
||||
# print(f"self.reply.processed_plain_text: {self.reply.processed_plain_text}")
|
||||
# print(f"reply: {self.reply}")
|
||||
return f"[回复<{self.reply.message_info.user_info.user_nickname}:{self.reply.message_info.user_info.user_id}> 的消息:{self.reply.processed_plain_text}]" # type: ignore
|
||||
return None
|
||||
else:
|
||||
return f"[{seg.type}:{str(seg.data)}]"
|
||||
except Exception as e:
|
||||
logger.error(f"处理消息段失败: {str(e)}, 类型: {seg.type}, 数据: {seg.data}")
|
||||
return f"[处理失败的{seg.type}消息]"
|
||||
|
||||
def _generate_detailed_text(self) -> str:
|
||||
"""生成详细文本,包含时间和用户信息"""
|
||||
# time_str = time.strftime("%m-%d %H:%M:%S", time.localtime(self.message_info.time))
|
||||
timestamp = self.message_info.time
|
||||
user_info = self.message_info.user_info
|
||||
|
||||
name = f"<{self.message_info.platform}:{user_info.user_id}:{user_info.user_nickname}:{user_info.user_cardname}>" # type: ignore
|
||||
return f"[{timestamp}],{name} 说:{self.processed_plain_text}\n"
|
||||
|
||||
|
||||
@dataclass
class MessageSending(MessageProcessBase):
    """Message in the sending state."""

    def __init__(
        self,
        message_id: str,
        chat_stream: "ChatStream",
        bot_user_info: UserInfo,
        sender_info: UserInfo | None,  # records who the message is addressed from/to
        message_segment: Seg,
        display_message: str = "",
        reply: Optional["MessageRecv"] = None,
        is_head: bool = False,
        is_emoji: bool = False,
        thinking_start_time: float = 0,
        apply_set_reply_logic: bool = False,
        reply_to: Optional[str] = None,
    ):
        """Initialise an outgoing message.

        Args:
            message_id: Identifier stored in the message info.
            chat_stream: Stream the message is sent to.
            bot_user_info: User info recorded as the message sender.
            sender_info: Stored as-is on ``sender_info``.
            message_segment: Segment tree carrying the content.
            display_message: Text to show when it differs from what is sent.
            reply: Optional message this one replies to.
            is_head: Stored as-is on ``is_head``.
            is_emoji: Whether the message is an emoji.
            thinking_start_time: Time at which thinking started.
            apply_set_reply_logic: Stored as-is on ``apply_set_reply_logic``.
            reply_to: Stored as-is on ``reply_to``.
        """
        super().__init__(
            message_id=message_id,
            chat_stream=chat_stream,
            bot_user_info=bot_user_info,
            message_segment=message_segment,
            reply=reply,
            thinking_start_time=thinking_start_time,
        )

        # Attributes specific to the sending state.
        self.sender_info = sender_info
        if reply:
            self.reply_to_message_id = reply.message_info.message_id
        else:
            self.reply_to_message_id = None
        self.is_head = is_head
        self.is_emoji = is_emoji
        self.apply_set_reply_logic = apply_set_reply_logic

        self.reply_to = reply_to

        # Used when the displayed content differs from the sent content.
        self.display_message = display_message

        self.interest_value = 0.0

    def build_reply(self):
        """Prefix the segment with a reply segment pointing at ``self.reply``, if any."""
        if not self.reply:
            return
        replied_id = self.reply.message_info.message_id
        self.reply_to_message_id = replied_id
        reply_seg = Seg(type="reply", data=self.reply.message_info.message_id)  # type: ignore
        self.message_segment = Seg(type="seglist", data=[reply_seg, self.message_segment])

    async def process(self) -> None:
        """Render the message segments into ``processed_plain_text`` when a segment is set."""
        if not self.message_segment:
            return
        self.processed_plain_text = await self._process_message_segments(self.message_segment)

    def to_dict(self):
        """Serialize the message, using the chat stream's user info as "user_info"."""
        serialized = super().to_dict()
        serialized["message_info"]["user_info"] = self.chat_stream.user_info.to_dict()
        return serialized

    def is_private_message(self) -> bool:
        """Return True when the message has no group info or no group id."""
        group = self.message_info.group_info
        if group is None:
            return True
        return self.message_info.group_info.group_id is None
|
||||
|
||||
|
||||
@dataclass
class MessageSet:
    """Collection of outgoing messages kept sorted by message time."""

    def __init__(self, chat_stream: "ChatStream", message_id: str):
        """Create an empty set.

        Args:
            chat_stream: Stream the messages belong to.
            message_id: Identifier of the set.
        """
        self.chat_stream = chat_stream
        self.message_id = message_id
        self.messages: list[MessageSending] = []
        self.time = round(time.time(), 3)  # creation time, 3 decimals

    def add_message(self, message: MessageSending) -> None:
        """Add a message and keep the list sorted by message time.

        Raises:
            TypeError: If ``message`` is not a ``MessageSending``.
        """
        if not isinstance(message, MessageSending):
            raise TypeError("MessageSet只能添加MessageSending类型的消息")
        self.messages.append(message)
        self.messages.sort(key=lambda x: x.message_info.time)  # type: ignore

    def get_message_by_index(self, index: int) -> Optional[MessageSending]:
        """Return the message at ``index``, or None when the index is out of range."""
        return self.messages[index] if 0 <= index < len(self.messages) else None

    def get_message_by_time(self, target_time: float) -> Optional[MessageSending]:
        """Return the message whose time is closest to ``target_time``.

        Relies on ``self.messages`` being sorted by time, as ``add_message``
        maintains. When two neighbours are equally distant, the later one is
        returned.

        Args:
            target_time: Timestamp to search for.

        Returns:
            The closest message, or None when the set is empty.
        """
        if not self.messages:
            return None

        # Binary search for the first message whose time is >= target_time
        # (or the last message when every time is smaller).
        left, right = 0, len(self.messages) - 1
        while left < right:
            mid = (left + right) // 2
            if self.messages[mid].message_info.time < target_time:  # type: ignore
                left = mid + 1
            else:
                right = mid

        # The lower bound is not necessarily the closest: the message just
        # before it may be nearer to the target, so compare the two.
        if left > 0:
            earlier = self.messages[left - 1]
            later = self.messages[left]
            gap_before = abs(target_time - earlier.message_info.time)  # type: ignore
            gap_after = abs(later.message_info.time - target_time)  # type: ignore
            if gap_before < gap_after:
                return earlier
        return self.messages[left]

    def clear_messages(self) -> None:
        """Remove every message from the set."""
        self.messages.clear()

    def remove_message(self, message: MessageSending) -> bool:
        """Remove ``message`` if present; return whether it was removed."""
        if message in self.messages:
            self.messages.remove(message)
            return True
        return False

    def __str__(self) -> str:
        return f"MessageSet(id={self.message_id}, count={len(self.messages)})"

    def __len__(self) -> int:
        return len(self.messages)
|
||||
|
||||
|
||||
def message_recv_from_dict(message_dict: dict) -> MessageRecv:
    """Build a MessageRecv from its serialized dictionary form."""
    received = MessageRecv(message_dict)
    return received
|
||||
|
||||
|
||||
def message_from_db_dict(db_dict: dict) -> MessageRecv:
    """Create a MessageRecv from a flat database row dictionary.

    The flat column names are mapped back into the nested structure that
    ``MessageRecv`` expects, and the message segment is rebuilt as a single
    text segment from the stored plain text.

    Args:
        db_dict: Flat dictionary of message columns.

    Returns:
        MessageRecv: The reconstructed message with optional fields filled in.
    """
    import json

    # Convert the flat database dictionary into the nested structure.
    message_info_dict = {
        "platform": db_dict.get("chat_info_platform"),
        "message_id": db_dict.get("message_id"),
        "time": db_dict.get("time"),
        "group_info": {
            "platform": db_dict.get("chat_info_group_platform"),
            "group_id": db_dict.get("chat_info_group_id"),
            "group_name": db_dict.get("chat_info_group_name"),
        },
        "user_info": {
            "platform": db_dict.get("user_platform"),
            "user_id": db_dict.get("user_id"),
            "user_nickname": db_dict.get("user_nickname"),
            "user_cardname": db_dict.get("user_cardname"),
        },
    }

    processed_text = db_dict.get("processed_plain_text", "")

    # Dictionary in the shape MessageRecv consumes.
    recv_dict = {
        "message_info": message_info_dict,
        "message_segment": {"type": "text", "data": processed_text},  # rebuilt from the plain text
        "raw_message": None,  # the raw message is not available from the row
        "processed_plain_text": processed_text,
    }

    msg = MessageRecv(recv_dict)

    # priority_info may arrive as a JSON string (the form it is serialized to
    # for storage); decode it back to a dict. Values that are already decoded,
    # empty, or not valid JSON are passed through unchanged.
    priority_info = db_dict.get("priority_info")
    if isinstance(priority_info, str) and priority_info:
        try:
            priority_info = json.loads(priority_info)
        except ValueError:
            logger.warning(f"priority_info 不是合法的JSON,保留原始值: {priority_info}")

    # Fill in the remaining optional fields from the row.
    msg.interest_value = db_dict.get("interest_value", 0.0)
    msg.is_mentioned = db_dict.get("is_mentioned")
    msg.priority_mode = db_dict.get("priority_mode", "interest")
    msg.priority_info = priority_info
    msg.is_emoji = db_dict.get("is_emoji", False)
    msg.is_picid = db_dict.get("is_picid", False)

    return msg
|
||||
172
src/chat/message_receive/storage.py
Normal file
172
src/chat/message_receive/storage.py
Normal file
@@ -0,0 +1,172 @@
|
||||
import re
|
||||
import traceback
|
||||
import json
|
||||
from typing import Union
|
||||
|
||||
from src.common.database.sqlalchemy_models import Messages, Images
|
||||
from src.common.logger import get_logger
|
||||
from .chat_stream import ChatStream
|
||||
from .message import MessageSending, MessageRecv
|
||||
from src.common.database.sqlalchemy_database_api import get_session
|
||||
from sqlalchemy import select, update, desc
|
||||
|
||||
logger = get_logger("message_storage")
|
||||
|
||||
class MessageStorage:
    """Static helpers that persist messages and update their stored ids."""

    @staticmethod
    async def store_message(message: Union[MessageSending, MessageRecv], chat_stream: ChatStream) -> None:
        """Store a message in the database.

        Failures are logged and swallowed; nothing is raised to the caller.

        Args:
            message: The sent or received message to persist.
            chat_stream: The chat stream the message belongs to.
        """
        try:
            # Tag blocks that must be stripped from the text before it is stored.
            pattern = r"<MainRule>.*?</MainRule>|<schedule>.*?</schedule>|<UserMessage>.*?</UserMessage>"

            processed_plain_text = message.processed_plain_text

            if processed_plain_text:
                processed_plain_text = MessageStorage.replace_image_descriptions(processed_plain_text)
                filtered_processed_plain_text = re.sub(pattern, "", processed_plain_text, flags=re.DOTALL)
            else:
                filtered_processed_plain_text = ""

            if isinstance(message, MessageSending):
                display_message = message.display_message
                if display_message:
                    filtered_display_message = re.sub(pattern, "", display_message, flags=re.DOTALL)
                else:
                    filtered_display_message = ""
                # Outgoing messages store fixed values for the receive-only fields.
                interest_value = 0
                is_mentioned = False
                reply_to = message.reply_to
                priority_mode = ""
                priority_info = {}
                is_emoji = False
                is_picid = False
                is_notify = False
                is_command = False
            else:
                filtered_display_message = ""
                interest_value = message.interest_value
                is_mentioned = message.is_mentioned
                reply_to = ""
                priority_mode = message.priority_mode
                priority_info = message.priority_info
                is_emoji = message.is_emoji
                is_picid = message.is_picid
                is_notify = message.is_notify
                is_command = message.is_command

            chat_info_dict = chat_stream.to_dict()
            user_info_dict = message.message_info.user_info.to_dict()  # type: ignore

            # message_id is stored as text, so the string value is used directly.
            msg_id = message.message_info.message_id

            # Treat a missing or None group_info / user_info as an empty dict.
            group_info_from_chat = chat_info_dict.get("group_info") or {}
            user_info_from_chat = chat_info_dict.get("user_info") or {}

            # priority_info is serialized to a JSON string for the text column.
            priority_info_json = json.dumps(priority_info) if priority_info else None

            new_message = Messages(
                message_id=msg_id,
                time=float(message.message_info.time),
                chat_id=chat_stream.stream_id,
                reply_to=reply_to,
                is_mentioned=is_mentioned,
                chat_info_stream_id=chat_info_dict.get("stream_id"),
                chat_info_platform=chat_info_dict.get("platform"),
                chat_info_user_platform=user_info_from_chat.get("platform"),
                chat_info_user_id=user_info_from_chat.get("user_id"),
                chat_info_user_nickname=user_info_from_chat.get("user_nickname"),
                chat_info_user_cardname=user_info_from_chat.get("user_cardname"),
                chat_info_group_platform=group_info_from_chat.get("platform"),
                chat_info_group_id=group_info_from_chat.get("group_id"),
                chat_info_group_name=group_info_from_chat.get("group_name"),
                chat_info_create_time=float(chat_info_dict.get("create_time", 0.0)),
                chat_info_last_active_time=float(chat_info_dict.get("last_active_time", 0.0)),
                user_platform=user_info_dict.get("platform"),
                user_id=user_info_dict.get("user_id"),
                user_nickname=user_info_dict.get("user_nickname"),
                user_cardname=user_info_dict.get("user_cardname"),
                processed_plain_text=filtered_processed_plain_text,
                display_message=filtered_display_message,
                memorized_times=message.memorized_times,
                interest_value=interest_value,
                priority_mode=priority_mode,
                priority_info=priority_info_json,
                is_emoji=is_emoji,
                is_picid=is_picid,
                is_notify=is_notify,
                is_command=is_command,
            )

            # Use the same session context manager as the other methods of this
            # class so the session is always released, including on failure.
            from src.common.database.sqlalchemy_models import get_db_session

            with get_db_session() as session:
                session.add(new_message)
                session.commit()
        except Exception:
            logger.exception("存储消息失败")
            logger.error(f"消息:{message}")
            traceback.print_exc()

    @staticmethod
    async def update_message(message):
        """Replace a stored message's id with the id carried in the message segment.

        The most recent row whose ``message_id`` equals the message's own id is
        updated to the "id" value found in the segment data. Only "text" and
        "reply" segments are handled. Failures are logged and swallowed.

        Args:
            message: Message whose ``message_info.message_id`` identifies the
                stored row and whose segment data carries the new id.
        """
        try:
            mmc_message_id = message.message_info.message_id
            segment = message.message_segment
            if segment.type not in ("text", "reply"):
                logger.info(f"更新消息ID错误,seg类型为{message.message_segment.type}")
                return
            qq_message_id = segment.data.get("id")
            if not qq_message_id:
                logger.info("消息不存在message_id,无法更新")
                return

            # The context manager handles the session lifecycle.
            from src.common.database.sqlalchemy_models import get_db_session

            with get_db_session() as session:
                matched_message = session.execute(
                    select(Messages).where(Messages.message_id == mmc_message_id).order_by(desc(Messages.time))
                ).scalar()

                if matched_message:
                    session.execute(
                        update(Messages).where(Messages.id == matched_message.id).values(message_id=qq_message_id)
                    )
                    logger.debug(f"更新消息ID成功: {matched_message.message_id} -> {qq_message_id}")
                else:
                    logger.debug("未找到匹配的消息")

        except Exception as e:
            logger.error(f"更新消息ID失败: {e}")

    @staticmethod
    def replace_image_descriptions(text: str) -> str:
        """Replace image-description markers with ``[picid:<image_id>]`` markers.

        Each marker's description is looked up in the Images table (newest
        record first). Markers with no matching record, or whose lookup fails,
        are left unchanged.

        Args:
            text: Text that may contain image-description markers.

        Returns:
            str: The text with resolvable markers replaced.
        """
        pattern = r"\[图片:([^\]]+)\]"

        # Cheap presence check so texts without markers never touch the database.
        if re.search(pattern, text) is None:
            logger.debug("文本中没有图片标记,直接返回原文本")
            return text

        def replace_match(match):
            description = match.group(1).strip()
            try:
                from src.common.database.sqlalchemy_models import get_db_session

                with get_db_session() as session:
                    image_record = session.execute(
                        select(Images).where(Images.description == description).order_by(desc(Images.timestamp))
                    ).scalar()
                    return f"[picid:{image_record.image_id}]" if image_record else match.group(0)
            except Exception as e:
                # Best effort: keep the original marker, but record why the lookup failed.
                logger.warning(f"查询图片描述对应的image_id失败: {e}")
                return match.group(0)

        return re.sub(pattern, replace_match, text)
|
||||
90
src/chat/message_receive/uni_message_sender.py
Normal file
90
src/chat/message_receive/uni_message_sender.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import asyncio
|
||||
import traceback
|
||||
|
||||
from rich.traceback import install
|
||||
|
||||
from src.common.message.api import get_global_api
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.message_receive.message import MessageSending
|
||||
from src.chat.message_receive.storage import MessageStorage
|
||||
from src.chat.utils.utils import truncate_message
|
||||
from src.chat.utils.utils import calculate_typing_time
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
logger = get_logger("sender")
|
||||
|
||||
|
||||
async def send_message(message: MessageSending, show_log=True) -> bool:
    """Send a message through the global API and log the outcome.

    Args:
        message: The message to send.
        show_log: Whether to log a line after a successful send.

    Returns:
        bool: True when the send call completed.

    Raises:
        Exception: Any exception raised while sending is logged and re-raised.
    """
    preview = truncate_message(message.processed_plain_text, max_length=120)

    try:
        api = get_global_api()
        await api.send_message(message)
        if not show_log:
            return True
        logger.info(f"已将消息 '{preview}' 发往平台'{message.message_info.platform}'")
        return True

    except Exception as e:
        logger.error(f"发送消息 '{preview}' 发往平台'{message.message_info.platform}' 失败: {str(e)}")
        traceback.print_exc()
        raise e  # propagate to the caller after logging
|
||||
|
||||
|
||||
class HeartFCSender:
    """Processes, sends and optionally stores outgoing messages."""

    def __init__(self):
        self.storage = MessageStorage()

    async def send_message(
        self, message: MessageSending, typing=False, set_reply=False, storage_message=True, show_log=True
    ):
        """Process, send and store one message.

        Args:
            message: The MessageSending object to send.
            typing: When True, wait for a computed typing delay before sending.
            set_reply: When True, call ``message.build_reply()`` before processing.
            storage_message: When True, store the message after a successful send.
            show_log: Forwarded to the module-level ``send_message``.

        Returns:
            The result of the module-level ``send_message``, or False when that
            result is falsy.

        Raises:
            ValueError: If the message lacks a chat stream, message info or message id.
            Exception: Any error during processing, sending or storing is
                logged and re-raised.
        """
        if not message.chat_stream:
            logger.error("消息缺少 chat_stream,无法发送")
            raise ValueError("消息缺少 chat_stream,无法发送")
        if not (message.message_info and message.message_info.message_id):
            logger.error("消息缺少 message_info 或 message_id,无法发送")
            raise ValueError("消息缺少 message_info 或 message_id,无法发送")

        message_id = message.message_info.message_id
        chat_id = message.chat_stream.stream_id

        try:
            if set_reply:
                message.build_reply()
                logger.debug(f"[{chat_id}] 选择回复引用消息: {message.processed_plain_text[:20]}...")

            await message.process()

            if typing:
                # Wait for the computed typing delay before the message goes out.
                await asyncio.sleep(
                    calculate_typing_time(
                        input_string=message.processed_plain_text,
                        thinking_start_time=message.thinking_start_time,
                        is_emoji=message.is_emoji,
                    )
                )

            send_result = await send_message(message, show_log=show_log)
            if not send_result:
                return False

            if storage_message:
                await self.storage.store_message(message, message.chat_stream)

            return send_result

        except Exception as e:
            logger.error(f"[{chat_id}] 处理或存储消息 {message_id} 时出错: {e}")
            raise e
|
||||
126
src/chat/planner_actions/action_manager.py
Normal file
126
src/chat/planner_actions/action_manager.py
Normal file
@@ -0,0 +1,126 @@
|
||||
from typing import Dict, Optional, Type
|
||||
|
||||
from src.chat.message_receive.chat_stream import ChatStream
|
||||
from src.common.logger import get_logger
|
||||
from src.plugin_system.core.component_registry import component_registry
|
||||
from src.plugin_system.base.component_types import ComponentType, ActionInfo
|
||||
from src.plugin_system.base.base_action import BaseAction
|
||||
|
||||
logger = get_logger("action_manager")
|
||||
|
||||
|
||||
class ActionManager:
    """Action manager that tracks the set of actions currently in use.

    Action classes and their metadata are resolved through the plugin
    system's component registry.
    """

    def __init__(self):
        """Start with the registry's default actions as the in-use set."""
        self._using_actions: Dict[str, ActionInfo] = component_registry.get_default_actions()

    # === Action execution ===

    def create_action(
        self,
        action_name: str,
        action_data: dict,
        reasoning: str,
        cycle_timers: dict,
        thinking_id: str,
        chat_stream: ChatStream,
        log_prefix: str,
        shutting_down: bool = False,
        action_message: Optional[dict] = None,
    ) -> Optional[BaseAction]:
        """Create an action handler instance.

        Args:
            action_name: Name of the action to instantiate.
            action_data: Data passed to the action.
            reasoning: Reason for executing the action.
            cycle_timers: Timer dictionary passed to the action.
            thinking_id: Thinking id passed to the action.
            chat_stream: Chat stream the action runs in.
            log_prefix: Prefix used in log lines.
            shutting_down: Whether the system is shutting down.
            action_message: Optional message dict passed to the action.

        Returns:
            Optional[BaseAction]: The new instance, or None when the action
            class or its info is not registered or instantiation fails.
        """
        try:
            # Look the class up explicitly as an ACTION component.
            action_cls: Type[BaseAction] = component_registry.get_component_class(
                action_name, ComponentType.ACTION
            )  # type: ignore
            if not action_cls:
                logger.warning(f"{log_prefix} 未找到Action组件: {action_name}")
                return None

            action_info = component_registry.get_component_info(action_name, ComponentType.ACTION)
            if not action_info:
                logger.warning(f"{log_prefix} 未找到Action组件信息: {action_name}")
                return None

            # The owning plugin's configuration is handed to the action instance.
            config_for_plugin = component_registry.get_plugin_config(action_info.plugin_name)

            constructor_kwargs = dict(
                action_data=action_data,
                reasoning=reasoning,
                cycle_timers=cycle_timers,
                thinking_id=thinking_id,
                chat_stream=chat_stream,
                log_prefix=log_prefix,
                shutting_down=shutting_down,
                plugin_config=config_for_plugin,
                action_message=action_message,
            )
            created = action_cls(**constructor_kwargs)

            logger.debug(f"创建Action实例成功: {action_name}")
            return created

        except Exception as e:
            logger.error(f"创建Action实例失败 {action_name}: {e}")
            import traceback

            logger.error(traceback.format_exc())
            return None

    def get_using_actions(self) -> Dict[str, ActionInfo]:
        """Return a shallow copy of the actions currently in use."""
        return self._using_actions.copy()

    # === Modification ===
    def remove_action_from_using(self, action_name: str) -> bool:
        """Remove an action from the in-use set.

        Args:
            action_name: Name of the action to remove.

        Returns:
            bool: True when the action was present and has been removed.
        """
        if action_name in self._using_actions:
            del self._using_actions[action_name]
            logger.debug(f"已从使用集中移除动作 {action_name}")
            return True

        logger.warning(f"移除失败: 动作 {action_name} 不在当前使用的动作集中")
        return False

    def restore_actions(self) -> None:
        """Reset the in-use set to the registry's default actions."""
        actions_to_restore = list(self._using_actions.keys())
        self._using_actions = component_registry.get_default_actions()
        logger.debug(f"恢复动作集: 从 {actions_to_restore} 恢复到默认动作集 {list(self._using_actions.keys())}")
|
||||
438
src/chat/planner_actions/action_modifier.py
Normal file
438
src/chat/planner_actions/action_modifier.py
Normal file
@@ -0,0 +1,438 @@
|
||||
import random
|
||||
import asyncio
|
||||
import hashlib
|
||||
import time
|
||||
from typing import List, Any, Dict, TYPE_CHECKING, Tuple
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.config.config import global_config, model_config
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager, ChatMessageContext
|
||||
from src.chat.planner_actions.action_manager import ActionManager
|
||||
from src.chat.utils.chat_message_builder import get_raw_msg_before_timestamp_with_chat, build_readable_messages
|
||||
from src.plugin_system.base.component_types import ActionInfo, ActionActivationType
|
||||
from src.plugin_system.core.global_announcement_manager import global_announcement_manager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from src.chat.message_receive.chat_stream import ChatStream
|
||||
|
||||
logger = get_logger("action_manager")
|
||||
|
||||
|
||||
class ActionModifier:
    """Narrows the action set held by an ``ActionManager`` for a single chat.

    ``modify_actions`` resets the manager to its default action set and then
    removes actions in three stages: actions disabled for this chat, actions
    whose associated message types the chat context does not support, and
    actions whose activation type (random, keyword, LLM judge, never) does not
    trigger for the current chat content. LLM judgements are awaited together
    and cached briefly per (action name, chat content hash).
    """

    def __init__(self, action_manager: ActionManager, chat_id: str):
        """Bind the modifier to one chat and set up the judge model and cache.

        Args:
            action_manager: Manager whose in-use action set will be modified.
            chat_id: Identifier used to look up the chat stream and its name.
        """
        self.chat_id = chat_id
        self.chat_stream: ChatStream = get_chat_manager().get_stream(self.chat_id)  # type: ignore
        self.log_prefix = f"[{get_chat_manager().get_stream_name(self.chat_id) or self.chat_id}]"

        self.action_manager = action_manager

        # Small model used only for LLM_JUDGE activation decisions.
        self.llm_judge = LLMRequest(model_set=model_config.model_task_config.utils_small, request_type="action.judge")

        # Cache of LLM judgements: key -> {"result": bool, "timestamp": float}.
        self._llm_judge_cache = {}
        self._cache_expiry_time = 30  # seconds a cached judgement stays valid
        self._last_context_hash = None  # hash of the previous context

    async def modify_actions(
        self,
        message_content: str = "",
    ):  # sourcery skip: use-named-expression
        """Run the full three-stage action filtering for this chat.

        After this coroutine finishes, the ``ActionManager`` holds the final
        set of usable actions, which the planner reads directly.

        Args:
            message_content: Text of the newest message; when non-empty it is
                appended to the readable chat content used for keyword and
                LLM-judge activation.
        """
        logger.debug(f"{self.log_prefix}开始完整动作修改流程")

        removals_s1: List[Tuple[str, str]] = []
        removals_s2: List[Tuple[str, str]] = []
        removals_s3: List[Tuple[str, str]] = []

        self.action_manager.restore_actions()
        all_actions = self.action_manager.get_using_actions()

        message_list_before_now_half = get_raw_msg_before_timestamp_with_chat(
            chat_id=self.chat_stream.stream_id,
            timestamp=time.time(),
            limit=min(int(global_config.chat.max_context_size * 0.33), 10),
        )
        chat_content = build_readable_messages(
            message_list_before_now_half,
            replace_bot_name=True,
            merge_messages=False,
            timestamp_mode="relative",
            read_mark=0.0,
            show_actions=True,
        )

        if message_content:
            chat_content = chat_content + "\n" + f"现在,最新的消息是:{message_content}"

        # === Stage 1: drop actions disabled for this chat ===
        disabled_actions = global_announcement_manager.get_disabled_chat_actions(self.chat_id)
        if disabled_actions:
            for disabled_action_name in disabled_actions:
                if disabled_action_name in all_actions:
                    removals_s1.append((disabled_action_name, "用户自行禁用"))
                    self.action_manager.remove_action_from_using(disabled_action_name)
                    logger.debug(f"{self.log_prefix}阶段一移除动作: {disabled_action_name},原因: 用户自行禁用")

        # === Stage 2: drop actions whose associated types are unsupported ===
        chat_context = self.chat_stream.context
        # Check only what survived stage 1. Using the pre-stage-1 snapshot would
        # try to remove an already-removed action again, which logs a spurious
        # "remove failed" warning and lists the action twice in the summary.
        type_mismatched_actions = self._check_action_associated_types(
            self.action_manager.get_using_actions(), chat_context
        )

        if type_mismatched_actions:
            removals_s2.extend(type_mismatched_actions)

        for action_name, reason in removals_s2:
            self.action_manager.remove_action_from_using(action_name)
            logger.debug(f"{self.log_prefix}阶段二移除动作: {action_name},原因: {reason}")

        # === Stage 3: activation-type filtering ===
        if chat_content is not None:
            logger.debug(f"{self.log_prefix}开始激活类型判定阶段")

            # Actions still in use after the previous stages.
            current_using_actions = self.action_manager.get_using_actions()

            removals_s3 = await self._get_deactivated_actions_by_type(
                current_using_actions,
                chat_content,
            )

            for action_name, reason in removals_s3:
                self.action_manager.remove_action_from_using(action_name)
                logger.debug(f"{self.log_prefix}阶段三移除动作: {action_name},原因: {reason}")

        # === Summary log ===
        all_removals = removals_s1 + removals_s2 + removals_s3
        removals_summary: str = ""
        if all_removals:
            removals_summary = " | ".join([f"{name}({reason})" for name, reason in all_removals])

        logger.info(
            f"{self.log_prefix} 动作修改流程结束,最终可用动作: {list(self.action_manager.get_using_actions().keys())}||移除记录: {removals_summary}"
        )

    def _check_action_associated_types(self, all_actions: Dict[str, ActionInfo], chat_context: ChatMessageContext):
        """Collect actions whose associated types the chat context rejects.

        Args:
            all_actions: Mapping of action name to action info to inspect.
            chat_context: Context object whose ``check_types`` decides support.

        Returns:
            List of ``(action_name, reason)`` tuples for actions to remove.
        """
        type_mismatched_actions: List[Tuple[str, str]] = []
        for action_name, action_info in all_actions.items():
            if action_info.associated_types and not chat_context.check_types(action_info.associated_types):
                associated_types_str = ", ".join(action_info.associated_types)
                reason = f"适配器不支持(需要: {associated_types_str})"
                type_mismatched_actions.append((action_name, reason))
                logger.debug(f"{self.log_prefix}决定移除动作: {action_name},原因: {reason}")
        return type_mismatched_actions

    async def _get_deactivated_actions_by_type(
        self,
        actions_with_info: Dict[str, ActionInfo],
        chat_content: str = "",
    ) -> List[tuple[str, str]]:
        """Decide which actions stay inactive according to their activation type.

        Args:
            actions_with_info: Mapping of action name to full action info.
            chat_content: Readable chat content used for keyword matching and
                for the LLM judgement prompt.

        Returns:
            List of ``(action_name, reason)`` tuples for actions to deactivate.
        """
        deactivated_actions = []

        # LLM_JUDGE actions are collected first and judged together afterwards.
        llm_judge_actions = {}

        actions_to_check = list(actions_with_info.items())
        random.shuffle(actions_to_check)

        for action_name, action_info in actions_to_check:
            activation_type = action_info.activation_type or action_info.focus_activation_type

            if activation_type == ActionActivationType.ALWAYS:
                continue  # always active, nothing to decide

            elif activation_type == ActionActivationType.RANDOM:
                probability = action_info.random_activation_probability
                if random.random() >= probability:
                    reason = f"RANDOM类型未触发(概率{probability})"
                    deactivated_actions.append((action_name, reason))
                    logger.debug(f"{self.log_prefix}未激活动作: {action_name},原因: {reason}")

            elif activation_type == ActionActivationType.KEYWORD:
                if not self._check_keyword_activation(action_name, action_info, chat_content):
                    keywords = action_info.activation_keywords
                    reason = f"关键词未匹配(关键词: {keywords})"
                    deactivated_actions.append((action_name, reason))
                    logger.debug(f"{self.log_prefix}未激活动作: {action_name},原因: {reason}")

            elif activation_type == ActionActivationType.LLM_JUDGE:
                llm_judge_actions[action_name] = action_info

            elif activation_type == ActionActivationType.NEVER:
                reason = "激活类型为never"
                deactivated_actions.append((action_name, reason))
                logger.debug(f"{self.log_prefix}未激活动作: {action_name},原因: 激活类型为never")

            else:
                # Unknown types are left active; only a warning is emitted.
                logger.warning(f"{self.log_prefix}未知的激活类型: {activation_type},跳过处理")

        if llm_judge_actions:
            llm_results = await self._process_llm_judge_actions_parallel(
                llm_judge_actions,
                chat_content,
            )
            for action_name, should_activate in llm_results.items():
                if not should_activate:
                    reason = "LLM判定未激活"
                    deactivated_actions.append((action_name, reason))
                    logger.debug(f"{self.log_prefix}未激活动作: {action_name},原因: {reason}")

        return deactivated_actions

    def _generate_context_hash(self, chat_content: str) -> str:
        """Return the MD5 hex digest of the chat content, used as a cache key part."""
        context_content = f"{chat_content}"
        return hashlib.md5(context_content.encode("utf-8")).hexdigest()

    async def _process_llm_judge_actions_parallel(
        self,
        llm_judge_actions: Dict[str, Any],
        chat_content: str = "",
    ) -> Dict[str, bool]:
        """Judge several LLM_JUDGE actions together, reusing cached results.

        Args:
            llm_judge_actions: Mapping of action name to action info to judge.
            chat_content: Readable chat content passed to each judgement.

        Returns:
            Mapping of action name to whether the action should be activated.
            A judgement that fails is reported as ``False``.
        """
        current_context_hash = self._generate_context_hash(chat_content)
        current_time = time.time()

        results = {}
        tasks_to_run = {}

        # Serve unexpired cache entries first; everything else needs the LLM.
        for action_name, action_info in llm_judge_actions.items():
            cache_key = f"{action_name}_{current_context_hash}"
            cached = self._llm_judge_cache.get(cache_key)
            if cached is not None and current_time - cached["timestamp"] < self._cache_expiry_time:
                results[action_name] = cached["result"]
                logger.debug(
                    f"{self.log_prefix}使用缓存结果 {action_name}: {'激活' if results[action_name] else '未激活'}"
                )
            else:
                tasks_to_run[action_name] = action_info

        if tasks_to_run:
            logger.debug(f"{self.log_prefix}并行执行LLM判定,任务数: {len(tasks_to_run)}")

            task_names = list(tasks_to_run)
            tasks = [
                self._llm_judge_action(action_name, action_info, chat_content)
                for action_name, action_info in tasks_to_run.items()
            ]

            try:
                task_results = await asyncio.gather(*tasks, return_exceptions=True)

                for action_name, result in zip(task_names, task_results, strict=False):
                    # With return_exceptions=True a failure comes back as an
                    # exception object. BaseException also covers CancelledError.
                    if isinstance(result, BaseException):
                        logger.error(f"{self.log_prefix}LLM判定action {action_name} 时出错: {result}")
                        results[action_name] = False
                    else:
                        results[action_name] = result

                    # Cache the boolean that is actually returned. Caching the
                    # raw result would store a truthy exception object and make
                    # a later cache hit report the action as activated.
                    cache_key = f"{action_name}_{current_context_hash}"
                    self._llm_judge_cache[cache_key] = {
                        "result": results[action_name],
                        "timestamp": current_time,
                    }

                logger.debug(f"{self.log_prefix}并行LLM判定完成,耗时: {time.time() - current_time:.2f}s")

            except Exception as e:
                logger.error(f"{self.log_prefix}并行LLM判定失败: {e}")
                # If the batch itself fails, every pending action is inactive.
                for action_name in tasks_to_run:
                    results[action_name] = False

        self._cleanup_expired_cache(current_time)

        return results

    def _cleanup_expired_cache(self, current_time: float):
        """Delete cache entries older than ``_cache_expiry_time`` seconds.

        Args:
            current_time: Timestamp (seconds) that entry ages are measured against.
        """
        expired_keys = [
            cache_key
            for cache_key, cache_data in self._llm_judge_cache.items()
            if current_time - cache_data["timestamp"] > self._cache_expiry_time
        ]
        for key in expired_keys:
            del self._llm_judge_cache[key]

        if expired_keys:
            logger.debug(f"{self.log_prefix}清理了 {len(expired_keys)} 个过期缓存条目")

    async def _llm_judge_action(
        self,
        action_name: str,
        action_info: ActionInfo,
        chat_content: str = "",
    ) -> bool:  # sourcery skip: move-assign-in-block, use-named-expression
        """Ask the judge model whether an action should be activated.

        Args:
            action_name: Name of the action being judged.
            action_info: Action info supplying the description, usage
                requirements and optional custom judge prompt.
            chat_content: Readable chat content appended to the prompt when
                non-empty.

        Returns:
            bool: True when the lower-cased reply contains "是", "yes" or
            "true"; False otherwise, including when any error occurs.
        """
        try:
            action_description = action_info.description
            action_require = action_info.action_require
            custom_prompt = action_info.llm_judge_prompt

            base_prompt = f"""
你需要判断在当前聊天情况下,是否应该激活名为"{action_name}"的动作。

动作描述:{action_description}

动作使用场景:
"""
            base_prompt += "".join(f"- {req}\n" for req in action_require)

            if custom_prompt:
                base_prompt += f"\n额外判定条件:\n{custom_prompt}\n"

            if chat_content:
                base_prompt += f"\n当前聊天记录:\n{chat_content}\n"

            base_prompt += """
请根据以上信息判断是否应该激活这个动作。
只需要回答"是"或"否",不要有其他内容。
"""

            response, _ = await self.llm_judge.generate_response_async(prompt=base_prompt)

            response = response.strip().lower()

            should_activate = "是" in response or "yes" in response or "true" in response

            logger.debug(
                f"{self.log_prefix}LLM判定动作 {action_name}:响应='{response}',结果={'激活' if should_activate else '不激活'}"
            )
            return should_activate

        except Exception as e:
            logger.error(f"{self.log_prefix}LLM判定动作 {action_name} 时出错: {e}")
            # Fail closed: an action that cannot be judged is not activated.
            return False

    def _check_keyword_activation(
        self,
        action_name: str,
        action_info: ActionInfo,
        chat_content: str = "",
    ) -> bool:
        """Check whether any activation keyword occurs in the chat content.

        Args:
            action_name: Name of the action, used in log messages.
            action_info: Action info supplying ``activation_keywords`` and
                ``keyword_case_sensitive``.
            chat_content: Text searched for the keywords.

        Returns:
            bool: True when at least one keyword is a substring of the text;
            False when none matches or no keywords are configured.
        """
        activation_keywords = action_info.activation_keywords
        case_sensitive = action_info.keyword_case_sensitive

        if not activation_keywords:
            logger.warning(f"{self.log_prefix}动作 {action_name} 设置为关键词触发但未配置关键词")
            return False

        search_text = chat_content or ""

        # Case-insensitive matching lower-cases both the text and the keywords.
        if not case_sensitive:
            search_text = search_text.lower()

        matched_keywords = [
            keyword
            for keyword in activation_keywords
            if (keyword if case_sensitive else keyword.lower()) in search_text
        ]

        if matched_keywords:
            logger.debug(f"{self.log_prefix}动作 {action_name} 匹配到关键词: {matched_keywords}")
            return True

        logger.debug(f"{self.log_prefix}动作 {action_name} 未匹配到任何关键词: {activation_keywords}")
        return False
|
||||
410
src/chat/planner_actions/planner.py
Normal file
410
src/chat/planner_actions/planner.py
Normal file
@@ -0,0 +1,410 @@
|
||||
import json
|
||||
import time
|
||||
import traceback
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
from rich.traceback import install
|
||||
from datetime import datetime
|
||||
from json_repair import repair_json
|
||||
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.config.config import global_config, model_config
|
||||
from src.common.logger import get_logger
|
||||
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
||||
from src.chat.utils.chat_message_builder import (
|
||||
build_readable_actions,
|
||||
get_actions_by_timestamp_with_chat,
|
||||
build_readable_messages_with_id,
|
||||
get_raw_msg_before_timestamp_with_chat,
|
||||
)
|
||||
from src.chat.utils.utils import get_chat_type_and_target_info
|
||||
from src.chat.planner_actions.action_manager import ActionManager
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
from src.plugin_system.base.component_types import ActionInfo, ChatMode, ComponentType
|
||||
from src.plugin_system.core.component_registry import component_registry
|
||||
|
||||
logger = get_logger("planner")
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
|
||||
def init_prompt():
    """Create the two planner prompt templates.

    Builds ``Prompt`` objects named "planner_prompt" (the overall planning
    request) and "action_prompt" (the per-action option entry), in that order.
    """
    planner_template = """
{time_block}
{identity_block}
你现在需要根据聊天内容,选择的合适的action来参与聊天。
{chat_context_description},以下是具体的聊天内容
{chat_content_block}

{moderation_prompt}

现在请你根据{by_what}选择合适的action和触发action的消息:
{actions_before_now_block}

{no_action_block}
{action_options_text}

你必须从上面列出的可用action中选择一个,并说明触发action的消息id(不是消息原文)和选择该action的原因。

请根据动作示例,以严格的 JSON 格式输出,且仅包含 JSON 内容:
"""

    # Doubled braces are literal braces once the template is formatted.
    action_template = """
动作:{action_name}
动作描述:{action_description}
{action_require}
{{
"action": "{action_name}",{action_parameters}{target_prompt}
"reason":"触发action的原因"
}}
"""

    Prompt(planner_template, "planner_prompt")
    Prompt(action_template, "action_prompt")
|
||||
|
||||
|
||||
class ActionPlanner:
    """Chooses the next action for a chat by prompting a planner LLM.

    The planner builds a prompt from recent messages and the actions currently
    in use, asks the model for a JSON decision, validates it, and returns the
    chosen action together with the message that triggered it.
    """

    def __init__(self, chat_id: str, action_manager: ActionManager):
        """Bind the planner to one chat and create its LLM request object.

        Args:
            chat_id: Identifier of the chat this planner serves.
            action_manager: Manager that supplies the actions currently in use.
        """
        self.chat_id = chat_id
        self.log_prefix = f"[{get_chat_manager().get_stream_name(chat_id) or chat_id}]"
        self.action_manager = action_manager
        # LLM used for action planning.
        self.planner_llm = LLMRequest(
            model_set=model_config.model_task_config.planner, request_type="planner"
        )

        self.last_obs_time_mark = 0.0
        # Counts consecutive plans whose target message id could not be resolved.
        self.plan_retry_count = 0
        self.max_plan_retries = 3

    def find_message_by_id(self, message_id: str, message_id_list: list) -> Optional[Dict[str, Any]]:
        """Look up the original message for a message id.

        Args:
            message_id: Id to search for.
            message_id_list: Entries shaped like ``{'id': str, 'message': dict}``.

        Returns:
            The ``message`` value of the first entry whose ``id`` matches, or
            None when no entry matches.
        """
        return next(
            (item.get("message") for item in message_id_list if item.get("id") == message_id),
            None,
        )

    def get_latest_message(self, message_id_list: list) -> Optional[Dict[str, Any]]:
        """Return the message of the last entry in the list.

        Args:
            message_id_list: Entries shaped like ``{'id': str, 'message': dict}``.
                The last entry is treated as the newest.

        Returns:
            The last entry's ``message`` value, or None for an empty list.
        """
        if not message_id_list:
            return None
        return message_id_list[-1].get("message")

    async def plan(
        self, mode: ChatMode = ChatMode.FOCUS
    ) -> Tuple[Dict[str, Dict[str, Any] | str], Optional[Dict[str, Any]]]:
        """Ask the planner LLM which action to take for the current context.

        Any failure (LLM request, JSON parsing, unexpected error) falls back to
        the ``no_reply`` action with an explanatory reasoning string.

        Args:
            mode: Chat mode. In FOCUS mode every action other than ``no_reply``
                is expected to carry a ``target_message_id``.

        Returns:
            A tuple ``(plan_result, target_message)``. ``plan_result`` holds
            ``"action_result"`` (action type, action data, reasoning, timestamp,
            parallel flag) and ``"action_prompt"`` (the prompt sent to the
            model). ``target_message`` is the resolved triggering message or
            None.
        """
        action = "no_reply"  # default action
        reasoning = "规划器初始化默认"
        action_data = {}
        current_available_actions: Dict[str, ActionInfo] = {}
        target_message: Optional[Dict[str, Any]] = None
        prompt: str = ""
        message_id_list: list = []

        try:
            is_group_chat, chat_target_info, current_available_actions = self.get_necessary_info()

            # --- Build the prompt ---
            prompt, message_id_list = await self.build_planner_prompt(
                is_group_chat=is_group_chat,
                chat_target_info=chat_target_info,
                current_available_actions=current_available_actions,
                mode=mode,
            )

            # --- Call the LLM (plain text generation) ---
            llm_content = None
            try:
                llm_content, (reasoning_content, _, _) = await self.planner_llm.generate_response_async(prompt=prompt)

                # Same messages either way; only the log level differs.
                log_raw = logger.info if global_config.debug.show_prompt else logger.debug
                log_raw(f"{self.log_prefix}规划器原始提示词: {prompt}")
                log_raw(f"{self.log_prefix}规划器原始响应: {llm_content}")
                if reasoning_content:
                    log_raw(f"{self.log_prefix}规划器推理: {reasoning_content}")

            except Exception as req_e:
                logger.error(f"{self.log_prefix}LLM 请求执行失败: {req_e}")
                reasoning = f"LLM 请求失败,模型出现问题: {req_e}"
                action = "no_reply"

            if llm_content:
                try:
                    parsed_json = json.loads(repair_json(llm_content))

                    if isinstance(parsed_json, list):
                        if parsed_json:
                            parsed_json = parsed_json[-1]
                            logger.warning(f"{self.log_prefix}LLM返回了多个JSON对象,使用最后一个: {parsed_json}")
                        else:
                            parsed_json = {}

                    if not isinstance(parsed_json, dict):
                        logger.error(f"{self.log_prefix}解析后的JSON不是字典类型: {type(parsed_json)}")
                        parsed_json = {}

                    action = parsed_json.get("action", "no_reply")
                    # The prompt templates ask the model for a "reason" field.
                    # Reading only "reasoning" always produced the fallback
                    # text, so accept either key.
                    reasoning = parsed_json.get("reasoning") or parsed_json.get("reason") or "未提供原因"

                    # Every other attribute is passed through as action data.
                    # "reason" stays in action_data, as it did before.
                    action_data.update(
                        {key: value for key, value in parsed_json.items() if key not in ("action", "reasoning")}
                    )

                    # In FOCUS mode a non-no_reply action needs a target message.
                    if mode == ChatMode.FOCUS and action != "no_reply":
                        if target_message_id := parsed_json.get("target_message_id"):
                            target_message = self.find_message_by_id(target_message_id, message_id_list)
                            if target_message is None:
                                self.plan_retry_count += 1
                                logger.warning(f"{self.log_prefix}无法找到target_message_id '{target_message_id}' 对应的消息,重试次数: {self.plan_retry_count}/{self.max_plan_retries}")

                                if self.plan_retry_count >= self.max_plan_retries:
                                    # Give up re-planning and target the newest message.
                                    logger.error(f"{self.log_prefix}连续{self.max_plan_retries}次plan获取target_message失败,选择最新消息作为target_message")
                                    target_message = self.get_latest_message(message_id_list)
                                    self.plan_retry_count = 0
                                else:
                                    # Re-plan; the counter bounds the recursion depth.
                                    return await self.plan(mode)
                            else:
                                self.plan_retry_count = 0
                        else:
                            logger.warning(f"{self.log_prefix}FOCUS模式下动作'{action}'缺少target_message_id")

                    if action == "no_action":
                        reasoning = "normal决定不使用额外动作"
                    elif action != "no_reply" and action != "reply" and action not in current_available_actions:
                        logger.warning(
                            f"{self.log_prefix}LLM 返回了当前不可用或无效的动作: '{action}' (可用: {list(current_available_actions.keys())}),将强制使用 'no_reply'"
                        )
                        reasoning = f"LLM 返回了当前不可用的动作 '{action}' (可用: {list(current_available_actions.keys())})。原始理由: {reasoning}"
                        action = "no_reply"

                except Exception as json_e:
                    logger.warning(f"{self.log_prefix}解析LLM响应JSON失败 {json_e}. LLM原始输出: '{llm_content}'")
                    traceback.print_exc()
                    reasoning = f"解析LLM响应JSON失败: {json_e}. 将使用默认动作 'no_reply'."
                    action = "no_reply"

        except Exception as outer_e:
            logger.error(f"{self.log_prefix}Planner 处理过程中发生意外错误,规划失败,将执行 no_reply: {outer_e}")
            traceback.print_exc()
            action = "no_reply"
            reasoning = f"Planner 内部处理错误: {outer_e}"

        is_parallel = False
        if mode == ChatMode.NORMAL and action in current_available_actions:
            is_parallel = current_available_actions[action].parallel_action

        action_result = {
            "action_type": action,
            "action_data": action_data,
            "reasoning": reasoning,
            "timestamp": time.time(),
            "is_parallel": is_parallel,
        }

        return (
            {
                "action_result": action_result,
                "action_prompt": prompt,
            },
            target_message,
        )

    async def build_planner_prompt(
        self,
        is_group_chat: bool,
        chat_target_info: Optional[dict],
        current_available_actions: Dict[str, ActionInfo],
        mode: ChatMode = ChatMode.FOCUS,
    ) -> tuple[str, list]:
        """Fill the planner prompt template with the current chat state.

        Args:
            is_group_chat: Whether the chat is a group chat.
            chat_target_info: Info about the private-chat partner, if any; its
                ``person_name`` / ``user_nickname`` keys are read.
            current_available_actions: Actions offered to the model as options.
            mode: Chat mode; selects the instruction block and whether a
                ``target_message_id`` field is requested.

        Returns:
            ``(prompt, message_id_list)``. On any error a fixed error string and
            an empty list are returned instead.
        """
        try:
            message_list_before_now = get_raw_msg_before_timestamp_with_chat(
                chat_id=self.chat_id,
                timestamp=time.time(),
                limit=int(global_config.chat.max_context_size * 0.6),
            )

            chat_content_block, message_id_list = build_readable_messages_with_id(
                messages=message_list_before_now,
                timestamp_mode="normal_no_YMD",
                read_mark=self.last_obs_time_mark,
                truncate=True,
                show_actions=True,
            )

            # Actions recorded for this chat in the last hour (at most five).
            actions_before_now = get_actions_by_timestamp_with_chat(
                chat_id=self.chat_id,
                timestamp_start=time.time() - 3600,
                timestamp_end=time.time(),
                limit=5,
            )

            actions_before_now_block = build_readable_actions(
                actions=actions_before_now,
            )

            actions_before_now_block = f"你刚刚选择并执行过的action是:\n{actions_before_now_block}"

            self.last_obs_time_mark = time.time()

            if mode == ChatMode.FOCUS:
                mentioned_bonus = ""
                if global_config.chat.mentioned_bot_inevitable_reply:
                    mentioned_bonus = "\n- 有人提到你"
                if global_config.chat.at_bot_inevitable_reply:
                    mentioned_bonus = "\n- 有人提到你,或者at你"

                by_what = "聊天内容"
                target_prompt = '\n "target_message_id":"触发action的消息id"'
                no_action_block = f"""重要说明:
- 'no_reply' 表示只进行不进行回复,等待合适的回复时机
- 当你刚刚发送了消息,没有人回复时,选择no_reply
- 当你一次发送了太多消息,为了避免打扰聊天节奏,选择no_reply

动作:reply
动作描述:参与聊天回复,发送文本进行表达
- 你想要闲聊或者随便附和{mentioned_bonus}
- 如果你刚刚进行了回复,不要对同一个话题重复回应
{{
"action": "reply",
"target_message_id":"触发action的消息id",
"reason":"回复的原因"
}}

"""
            else:
                by_what = "聊天内容和用户的最新消息"
                target_prompt = ""
                no_action_block = """重要说明:
- 'reply' 表示只进行普通聊天回复,不执行任何额外动作
- 其他action表示在普通回复的基础上,执行相应的额外动作"""

            chat_context_description = "你现在正在一个群聊中"
            chat_target_name = None  # only relevant for private chats
            if not is_group_chat and chat_target_info:
                chat_target_name = (
                    chat_target_info.get("person_name") or chat_target_info.get("user_nickname") or "对方"
                )
                chat_context_description = f"你正在和 {chat_target_name} 私聊"

            action_option_parts = []

            for using_actions_name, using_actions_info in current_available_actions.items():
                if using_actions_info.action_parameters:
                    param_text = "\n" + "".join(
                        f' "{param_name}":"{param_description}"\n'
                        for param_name, param_description in using_actions_info.action_parameters.items()
                    )
                    param_text = param_text.rstrip("\n")
                else:
                    param_text = ""

                require_text = "".join(
                    f"- {require_item}\n" for require_item in using_actions_info.action_require
                ).rstrip("\n")

                using_action_prompt = await global_prompt_manager.get_prompt_async("action_prompt")
                using_action_prompt = using_action_prompt.format(
                    action_name=using_actions_name,
                    action_description=using_actions_info.description,
                    action_parameters=param_text,
                    action_require=require_text,
                    target_prompt=target_prompt,
                )

                action_option_parts.append(using_action_prompt)

            action_options_block = "".join(action_option_parts)

            moderation_prompt_block = "请不要输出违法违规内容,不要输出色情,暴力,政治相关内容,如有敏感内容,请规避。"

            time_block = f"当前时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

            bot_name = global_config.bot.nickname
            if global_config.bot.alias_names:
                bot_nickname = f",也有人叫你{','.join(global_config.bot.alias_names)}"
            else:
                bot_nickname = ""
            bot_core_personality = global_config.personality.personality_core
            identity_block = f"你的名字是{bot_name}{bot_nickname},你{bot_core_personality}:"

            planner_prompt_template = await global_prompt_manager.get_prompt_async("planner_prompt")
            prompt = planner_prompt_template.format(
                time_block=time_block,
                by_what=by_what,
                chat_context_description=chat_context_description,
                chat_content_block=chat_content_block,
                actions_before_now_block=actions_before_now_block,
                no_action_block=no_action_block,
                action_options_text=action_options_block,
                moderation_prompt=moderation_prompt_block,
                identity_block=identity_block,
            )
            return prompt, message_id_list
        except Exception as e:
            logger.error(f"构建 Planner 提示词时出错: {e}")
            logger.error(traceback.format_exc())
            return "构建 Planner Prompt 时出错", []

    def get_necessary_info(self) -> Tuple[bool, Optional[dict], Dict[str, ActionInfo]]:
        """Gather the chat type, chat target and available actions for planning.

        Returns:
            ``(is_group_chat, chat_target_info, current_available_actions)``,
            where the last item maps each in-use action name that is also
            registered to its registered ``ActionInfo``.
        """
        is_group_chat, chat_target_info = get_chat_type_and_target_info(self.chat_id)
        logger.debug(f"{self.log_prefix}获取到聊天信息 - 群聊: {is_group_chat}, 目标信息: {chat_target_info}")

        current_available_actions_dict = self.action_manager.get_using_actions()

        # Full action info comes from the component registry.
        all_registered_actions: Dict[str, ActionInfo] = component_registry.get_components_by_type(  # type: ignore
            ComponentType.ACTION
        )
        current_available_actions = {}
        for action_name in current_available_actions_dict:
            if action_name in all_registered_actions:
                current_available_actions[action_name] = all_registered_actions[action_name]
            else:
                logger.warning(f"{self.log_prefix}使用中的动作 {action_name} 未在已注册动作中找到")

        return is_group_chat, chat_target_info, current_available_actions
|
||||
|
||||
|
||||
init_prompt()
|
||||
1139
src/chat/replyer/default_generator.py
Normal file
1139
src/chat/replyer/default_generator.py
Normal file
File diff suppressed because it is too large
Load Diff
61
src/chat/replyer/replyer_manager.py
Normal file
61
src/chat/replyer/replyer_manager.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from typing import Dict, Optional, List, Tuple
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.config.api_ada_configs import TaskConfig
|
||||
from src.chat.message_receive.chat_stream import ChatStream, get_chat_manager
|
||||
from src.chat.replyer.default_generator import DefaultReplyer
|
||||
|
||||
logger = get_logger("ReplyerManager")
|
||||
|
||||
|
||||
class ReplyerManager:
    """Keeps one ``DefaultReplyer`` per chat stream id."""

    def __init__(self):
        """Start with an empty stream-id -> replyer cache."""
        self._repliers: Dict[str, DefaultReplyer] = {}

    def get_replyer(
        self,
        chat_stream: Optional[ChatStream] = None,
        chat_id: Optional[str] = None,
        model_set_with_weight: Optional[List[Tuple[TaskConfig, float]]] = None,
        request_type: str = "replyer",
    ) -> Optional[DefaultReplyer]:
        """Return the cached replyer for a stream, creating it on first use.

        ``model_set_with_weight`` and ``request_type`` only take effect when
        the instance for a stream id is first created; later calls return the
        cached instance and ignore them.

        Args:
            chat_stream: Stream to reply in; its ``stream_id`` is the cache key.
            chat_id: Stream id used when ``chat_stream`` is not given.
            model_set_with_weight: Passed to ``DefaultReplyer`` on creation;
                may be None.
            request_type: Passed to ``DefaultReplyer`` on creation.

        Returns:
            The replyer, or None when no stream id is available or the chat
            stream cannot be found.
        """
        if chat_stream:
            stream_id = chat_stream.stream_id
        else:
            stream_id = chat_id

        if not stream_id:
            logger.warning("[ReplyerManager] 缺少 stream_id,无法获取回复器。")
            return None

        # Cache hit: hand back the existing instance unchanged.
        if stream_id in self._repliers:
            logger.debug(f"[ReplyerManager] 为 stream_id '{stream_id}' 返回已存在的回复器实例。")
            return self._repliers[stream_id]

        logger.debug(f"[ReplyerManager] 为 stream_id '{stream_id}' 创建新的回复器实例并缓存。")

        # Resolve the stream from the chat manager when only an id was given.
        target_stream = chat_stream
        if not target_stream:
            chat_manager = get_chat_manager()
            if chat_manager:
                target_stream = chat_manager.get_stream(stream_id)

        if not target_stream:
            logger.warning(f"[ReplyerManager] 未找到 stream_id='{stream_id}' 的聊天流,无法创建回复器。")
            return None

        new_replyer = DefaultReplyer(
            chat_stream=target_stream,
            model_set_with_weight=model_set_with_weight,
            request_type=request_type,
        )
        self._repliers[stream_id] = new_replyer
        return new_replyer
|
||||
|
||||
|
||||
# 创建一个全局实例
|
||||
replyer_manager = ReplyerManager()
|
||||
1145
src/chat/utils/chat_message_builder.py
Normal file
1145
src/chat/utils/chat_message_builder.py
Normal file
File diff suppressed because it is too large
Load Diff
282
src/chat/utils/prompt_builder.py
Normal file
282
src/chat/utils/prompt_builder.py
Normal file
@@ -0,0 +1,282 @@
|
||||
import re
|
||||
import asyncio
|
||||
import contextvars
|
||||
|
||||
from rich.traceback import install
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Dict, Any, Optional, List, Union
|
||||
|
||||
from src.common.logger import get_logger
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
logger = get_logger("prompt_build")
|
||||
|
||||
|
||||
class PromptContext:
    """Registry of prompt templates grouped by a per-coroutine context id."""

    def __init__(self):
        # context_id -> {prompt name -> Prompt}
        self._context_prompts: Dict[str, Dict[str, "Prompt"]] = {}
        # Context variable holding the id of the scope the current coroutine is in.
        self._current_context_var = contextvars.ContextVar("current_context", default=None)
        # Guards access to ``_context_prompts``.
        self._context_lock = asyncio.Lock()

    @property
    def _current_context(self) -> Optional[str]:
        """Context id currently stored in the context variable (``None`` if unset)."""
        return self._current_context_var.get()

    @_current_context.setter
    def _current_context(self, value: Optional[str]):
        """Store ``value`` as the current context id."""
        self._current_context_var.set(value)

    @asynccontextmanager
    async def async_scope(self, context_id: Optional[str] = None):
        # sourcery skip: hoist-statement-from-if, use-contextlib-suppress
        """
        Enter a temporary prompt scope identified by ``context_id``.

        With ``context_id=None`` the current context is left untouched. If the
        registry lock cannot be acquired within 5 seconds, a warning is logged
        and the scope is entered without switching context.

        Yields:
            This ``PromptContext`` instance.
        """
        if context_id is not None:
            try:
                # Bounded wait so a stuck lock holder cannot block the caller forever.
                await asyncio.wait_for(self._context_lock.acquire(), timeout=5.0)
            except asyncio.TimeoutError:
                logger.warning(f"获取上下文锁超时,context_id: {context_id}")
                # Fall through without a scope of our own.
                context_id = None
            else:
                try:
                    self._context_prompts.setdefault(context_id, {})
                finally:
                    self._context_lock.release()

        # Remember the value visible to this coroutine, then switch only when
        # there is a (truthy) context id to switch to.
        previous_context = self._current_context
        token = self._current_context_var.set(context_id) if context_id else None

        try:
            yield self
        finally:
            # A token only exists when a context id was actually installed.
            if token is not None:
                try:
                    self._current_context_var.reset(token)
                except Exception as e:
                    logger.warning(f"恢复上下文时出错: {e}")
                    # reset() failed; fall back to writing the old value directly.
                    try:
                        self._current_context = previous_context
                    except Exception:
                        pass  # best effort: restoring the context is not critical

    async def get_prompt_async(self, name: str) -> Optional["Prompt"]:
        """Return the prompt called ``name`` from the current scope, or ``None``."""
        async with self._context_lock:
            current_context = self._current_context
            logger.debug(f"获取提示词: {name} 当前上下文: {current_context}")
            if not current_context:
                return None
            scoped_prompts = self._context_prompts.get(current_context, {})
            if name in scoped_prompts:
                return scoped_prompts[name]
            return None

    async def register_async(self, prompt: "Prompt", context_id: Optional[str] = None) -> None:
        """
        Register ``prompt`` under ``context_id``, or under the current context
        when no id is given. Does nothing if neither is set.
        """
        async with self._context_lock:
            target_context = context_id or self._current_context
            if target_context:
                self._context_prompts.setdefault(target_context, {})[prompt.name] = prompt
|
||||
|
||||
|
||||
class PromptManager:
    """Global prompt registry with an overlay of scoped (per-message) prompts."""

    def __init__(self):
        self._prompts = {}  # globally registered prompts, keyed by name
        self._counter = 0  # source of names for anonymous prompts
        self._context = PromptContext()
        self._lock = asyncio.Lock()

    @asynccontextmanager
    async def async_message_scope(self, message_id: Optional[str] = None):
        """Open a temporary prompt scope for one message; ``message_id`` may be ``None``."""
        async with self._context.async_scope(message_id):
            yield self

    async def get_prompt_async(self, name: str) -> "Prompt":
        """
        Look up a prompt by name, preferring the current scope over the global registry.

        Raises:
            KeyError: If the name is found in neither place.
        """
        scoped_prompt = await self._context.get_prompt_async(name)
        if scoped_prompt is not None:
            logger.debug(f"从上下文中获取提示词: {name} {scoped_prompt}")
            return scoped_prompt

        # Not in the current scope: fall back to the global registry.
        async with self._lock:
            if name in self._prompts:
                return self._prompts[name]
            raise KeyError(f"Prompt '{name}' not found")

    def generate_name(self, template: str) -> str:
        """Return a fresh ``prompt_<n>`` name for an unnamed prompt (``template`` is unused)."""
        self._counter += 1
        return f"prompt_{self._counter}"

    def register(self, prompt: "Prompt") -> None:
        """Add ``prompt`` to the global registry, naming it first if it has no name."""
        if not prompt.name:
            prompt.name = self.generate_name(prompt.template)
        self._prompts[prompt.name] = prompt

    def add_prompt(self, name: str, fstr: str) -> "Prompt":
        """Create a ``Prompt`` from ``fstr`` under ``name``, register it and return it."""
        created = Prompt(fstr, name=name)
        self._prompts[created.name] = created
        return created

    async def format_prompt(self, name: str, **kwargs) -> str:
        """Fetch the prompt called ``name`` and format it with ``kwargs``."""
        found = await self.get_prompt_async(name)
        return found.format(**kwargs)
|
||||
|
||||
|
||||
# 全局单例
|
||||
global_prompt_manager = PromptManager()
|
||||
|
||||
|
||||
class Prompt(str):
    r"""
    A string template with ``{placeholder}`` fields.

    The string value of the instance is the formatted result when initial
    arguments were supplied, otherwise it is empty and ``str(prompt)`` yields
    the raw template. Literal braces are written as ``\{`` and ``\}`` in the
    template.

    Attributes set on each instance:
        template: the raw template as passed in.
        name: optional registry name.
        args: placeholder names found in the template, in first-seen order.
        _args / _kwargs: the positional / keyword values bound at creation.
    """

    # Temporary markers standing in for escaped braces while formatting.
    _TEMP_LEFT_BRACE = "__ESCAPED_LEFT_BRACE__"
    _TEMP_RIGHT_BRACE = "__ESCAPED_RIGHT_BRACE__"

    @staticmethod
    def _process_escaped_braces(template) -> str:
        r"""Replace escaped braces (``\{`` and ``\}``) with temporary markers.

        A list template is joined with newlines; any other non-string value is
        converted with ``str()`` first.
        """
        if isinstance(template, list):
            template = "\n".join(str(item) for item in template)
        elif not isinstance(template, str):
            template = str(template)

        return template.replace("\\{", Prompt._TEMP_LEFT_BRACE).replace("\\}", Prompt._TEMP_RIGHT_BRACE)

    @staticmethod
    def _restore_escaped_braces(template: str) -> str:
        """Turn the temporary markers back into literal ``{`` and ``}``."""
        return template.replace(Prompt._TEMP_LEFT_BRACE, "{").replace(Prompt._TEMP_RIGHT_BRACE, "}")

    @staticmethod
    def _extract_template_args(processed_template: str) -> List[str]:
        """Return the distinct, non-empty placeholder names in first-seen order."""
        template_args = []
        for expr in re.findall(r"\{(.*?)}", processed_template):
            if expr and expr not in template_args:
                template_args.append(expr)
        return template_args

    def __new__(cls, fstr, name: Optional[str] = None, args: Union[List[Any], tuple[Any, ...]] = None, **kwargs):
        """
        Build a prompt from ``fstr``.

        Args:
            fstr: Template text (a list is joined with newlines when formatting).
            name: Optional registry name.
            args: Positional values, matched to placeholders in order of appearance.
            **kwargs: Keyword values for placeholders. The reserved key
                ``_should_register`` (default ``True``) controls auto-registration.
        """
        # Normalise a tuple of positional values to a list.
        if isinstance(args, tuple):
            args = list(args)
        should_register = kwargs.pop("_should_register", True)

        template_args = cls._extract_template_args(cls._process_escaped_braces(fstr))

        # Format immediately when initial values were provided.
        if kwargs or args:
            formatted = cls._format_template(fstr, args=args, kwargs=kwargs)
            obj = super().__new__(cls, formatted)
        else:
            obj = super().__new__(cls, "")

        obj.template = fstr
        obj.name = name
        obj.args = template_args
        obj._args = args or []
        obj._kwargs = kwargs

        # Auto-register globally only when no scoped context is active.
        if should_register and not global_prompt_manager._context._current_context:
            global_prompt_manager.register(obj)
        return obj

    @classmethod
    async def create_async(
        cls, fstr, name: Optional[str] = None, args: Union[List[Any], tuple[Any, ...]] = None, **kwargs
    ):
        """Create a prompt and, if a scoped context is active, register it in that scope."""
        prompt = cls(fstr, name, args, **kwargs)
        if global_prompt_manager._context._current_context:
            await global_prompt_manager._context.register_async(prompt)
        return prompt

    @classmethod
    def _format_template(cls, template, args: List[Any] = None, kwargs: Dict[str, Any] = None) -> str:
        r"""
        Substitute positional and keyword values into ``template``.

        Positional values are bound to placeholders in order of appearance and
        take precedence over a keyword of the same name. Nested ``Prompt``
        values are formatted first. All values are substituted in a single
        pass, so braces inside substituted values are never re-interpreted.

        Raises:
            ValueError: If more positional values than placeholders are given,
                or a placeholder has no value.
        """
        # Tolerate kwargs=None so nested Prompt arguments can be expanded.
        kwargs = kwargs or {}
        processed_template = cls._process_escaped_braces(template)
        template_args = cls._extract_template_args(processed_template)

        formatted_args = {}
        formatted_kwargs = {}

        # Positional values: bind each to the placeholder at the same index.
        if args:
            if len(args) > len(template_args):
                logger.error(
                    f"构建提示词模板失败,解析到的参数列表{template_args},长度为{len(template_args)},输入的参数列表为{args},提示词模板为{template}"
                )
                raise ValueError("格式化模板失败")
            for placeholder, arg in zip(template_args, args):
                formatted_args[placeholder] = arg.format(**kwargs) if isinstance(arg, Prompt) else arg

        # Keyword values: a nested Prompt is formatted with the other keywords.
        for key, value in kwargs.items():
            if isinstance(value, Prompt):
                remaining_kwargs = {k: v for k, v in kwargs.items() if k != key}
                formatted_kwargs[key] = value.format(**remaining_kwargs)
            else:
                formatted_kwargs[key] = value

        try:
            if args or kwargs:
                # One pass over the template; positional bindings win on conflict.
                processed_template = processed_template.format(**{**formatted_kwargs, **formatted_args})
            return cls._restore_escaped_braces(processed_template)
        except (IndexError, KeyError) as e:
            raise ValueError(
                f"格式化模板失败: {template}, args={formatted_args}, kwargs={formatted_kwargs} {str(e)}"
            ) from e

    def format(self, *args, **kwargs) -> "str":
        """
        Format the template and return a plain string.

        Values passed here replace those bound at creation; when none are
        passed the creation-time values are used. With no values at all the
        raw template is returned unchanged.
        """
        use_args = list(args) if args else self._args
        use_kwargs = kwargs or self._kwargs
        if not (use_args or use_kwargs):
            return self.template
        # Format directly instead of re-entering the constructor, so keyword
        # names such as ``name`` or ``args`` cannot clash with its parameters.
        return self._format_template(self.template, args=use_args, kwargs=use_kwargs)

    def __str__(self) -> str:
        """Formatted text if values were bound at creation, else the raw template."""
        return super().__str__() if self._kwargs or self._args else self.template

    def __repr__(self) -> str:
        return f"Prompt(template='{self.template}', name='{self.name}')"
|
||||
1467
src/chat/utils/statistic.py
Normal file
1467
src/chat/utils/statistic.py
Normal file
File diff suppressed because it is too large
Load Diff
158
src/chat/utils/timer_calculator.py
Normal file
158
src/chat/utils/timer_calculator.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import asyncio
|
||||
|
||||
from time import perf_counter
|
||||
from functools import wraps
|
||||
from typing import Optional, Dict, Callable
|
||||
from rich.traceback import install
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
"""
|
||||
# 更好的计时器
|
||||
|
||||
使用形式:
|
||||
- 上下文
|
||||
- 装饰器
|
||||
- 直接实例化
|
||||
|
||||
使用场景:
|
||||
- 使用Timer:在需要测量代码执行时间时(如性能测试、计时器工具),Timer类是更可靠、高精度的选择。
|
||||
- 使用time.time()的场景:当需要记录实际时间点(如日志、时间戳)时使用,但避免用它测量时间间隔。
|
||||
|
||||
使用方式:
|
||||
|
||||
【装饰器】
|
||||
time_dict = {}
|
||||
@Timer("计数", time_dict)
|
||||
def func():
|
||||
pass
|
||||
print(time_dict)
|
||||
|
||||
【上下文_1】
|
||||
def func():
|
||||
with Timer() as t:
|
||||
pass
|
||||
print(t)
|
||||
print(t.human_readable)
|
||||
|
||||
【上下文_2】
|
||||
def func():
|
||||
time_dict = {}
|
||||
with Timer("计数", time_dict):
|
||||
pass
|
||||
print(time_dict)
|
||||
|
||||
【直接实例化】
|
||||
a = Timer()
|
||||
print(a) # 直接输出当前 perf_counter 值
|
||||
|
||||
参数:
|
||||
- name:计时器的名字,默认为 None
|
||||
- storage:计时器结果存储字典,默认为 None
|
||||
- auto_unit:自动选择单位(毫秒或秒),默认为 True(自动根据时间切换毫秒或秒)
|
||||
- do_type_check:是否进行类型检查,默认为 False(不进行类型检查)
|
||||
|
||||
属性:human_readable
|
||||
|
||||
自定义错误:TimerTypeError
|
||||
"""
|
||||
|
||||
|
||||
class TimerTypeError(TypeError):
    """Raised when a ``Timer`` argument has the wrong type."""

    __slots__ = ()

    def __init__(self, param, expected_type, actual_type):
        """
        Args:
            param: Name of the offending parameter.
            expected_type: Human-readable description of the accepted type.
            actual_type: The type object that was actually received.
        """
        message = f"参数 '{param}' 类型错误,期望 {expected_type},实际得到 {actual_type.__name__}"
        super().__init__(message)
|
||||
|
||||
|
||||
class Timer:
    """
    Elapsed-time measurement based on ``perf_counter``.

    Three ways to use it:
    1. Decorator: times every call of a function or coroutine function.
    2. Context manager: times the body of a ``with`` block.
    3. Plain instance: if never started, ``str(timer)`` is the current
       ``perf_counter()`` value.

    When both ``name`` and ``storage`` are set, each measurement is written to
    ``storage[name]``.
    """

    __slots__ = ("name", "storage", "elapsed", "auto_unit", "start")

    def __init__(
        self,
        name: Optional[str] = None,
        storage: Optional[Dict[str, float]] = None,
        auto_unit: bool = True,
        do_type_check: bool = False,
    ):
        """
        Args:
            name: Key under which results are stored.
            storage: Dict receiving ``{name: elapsed_seconds}``.
            auto_unit: Show milliseconds below one second, seconds otherwise.
            do_type_check: Validate ``name`` and ``storage`` types.

        Raises:
            TimerTypeError: If ``do_type_check`` is set and a type is wrong.
        """
        if do_type_check:
            self._validate_types(name, storage)

        self.name = name
        self.storage = storage
        self.elapsed: float = None  # type: ignore

        self.auto_unit = auto_unit
        self.start: float = None  # type: ignore

    @staticmethod
    def _validate_types(name, storage):
        """Raise ``TimerTypeError`` unless name is ``str``/None and storage is ``dict``/None."""
        if name is not None and not isinstance(name, str):
            raise TimerTypeError("name", "Optional[str]", type(name))

        if storage is not None and not isinstance(storage, dict):
            raise TimerTypeError("storage", "Optional[dict]", type(storage))

    def _begin_call(self) -> float:
        """Mark the timer as running and return this call's own start time."""
        started = perf_counter()
        self.start = started
        return started

    def _finish_call(self, started: float) -> None:
        """Record the elapsed time of the call that began at ``started``."""
        self.start = started
        self.elapsed = perf_counter() - started
        self._record_time()

    def __call__(self, func: Optional[Callable] = None) -> Callable:
        """
        Decorate ``func`` so every call is timed.

        Called without a function, returns a decorator that creates a new
        ``Timer`` named after the decorated function when this one has no name.
        The returned wrapper exposes the timer as ``wrapper.__timer__``.
        """
        if func is None:
            return lambda f: Timer(name=self.name or f.__name__, storage=self.storage, auto_unit=self.auto_unit)(f)

        # Each call keeps its start time in a local variable. Going through
        # ``with self`` would share ``self.start`` between overlapping calls
        # (recursion, concurrent coroutines) and corrupt their measurements.
        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            started = self._begin_call()
            try:
                return await func(*args, **kwargs)
            finally:
                self._finish_call(started)

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            started = self._begin_call()
            try:
                return func(*args, **kwargs)
            finally:
                self._finish_call(started)

        wrapper = async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper
        wrapper.__timer__ = self  # keep a reference to the timer  # type: ignore
        return wrapper

    def __enter__(self):
        """Start timing and return the timer."""
        self.start = perf_counter()
        return self

    def __exit__(self, *args):
        """Stop timing and record the result; exceptions are not suppressed."""
        self.elapsed = perf_counter() - self.start
        self._record_time()
        return False

    def _record_time(self):
        """Write the elapsed time to ``storage[name]`` when both are set."""
        if self.storage is not None and self.name:
            self.storage[self.name] = self.elapsed

    @property
    def human_readable(self) -> str:
        """Elapsed time as display text, or a placeholder if nothing was timed."""
        if self.elapsed is None:
            return "未计时"

        if self.auto_unit:
            return f"{self.elapsed * 1000:.2f}毫秒" if self.elapsed < 1 else f"{self.elapsed:.2f}秒"
        return f"{self.elapsed:.4f}秒"

    def __str__(self):
        if self.start is not None:
            if self.elapsed is None:
                # Started but not finished: show the running time.
                current_elapsed = perf_counter() - self.start
                return f"<Timer {self.name or '匿名'} [计时中: {current_elapsed:.4f}秒]>"
            return f"<Timer {self.name or '匿名'} [{self.human_readable}]>"
        # Never started: behave like a plain perf_counter reading.
        return f"{perf_counter()}"
|
||||
477
src/chat/utils/typo_generator.py
Normal file
477
src/chat/utils/typo_generator.py
Normal file
@@ -0,0 +1,477 @@
|
||||
"""
|
||||
错别字生成器 - 基于拼音和字频的中文错别字生成工具
|
||||
"""
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
import jieba
|
||||
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from pypinyin import Style, pinyin
|
||||
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("typo_gen")
|
||||
|
||||
|
||||
class ChineseTypoGenerator:
    """Generate Chinese typos by swapping in homophones chosen by pinyin and frequency."""

    def __init__(self, error_rate=0.3, min_freq=5, tone_error_rate=0.2, word_replace_rate=0.3, max_freq_diff=200):
        """
        Args:
            error_rate: Probability of attempting a single-character replacement.
            min_freq: Minimum frequency a replacement candidate must have.
            tone_error_rate: Probability of also considering a wrong-tone pinyin.
            word_replace_rate: Probability of attempting a whole-word replacement.
            max_freq_diff: Largest allowed frequency drop from original to replacement.
        """
        self.error_rate = error_rate
        self.min_freq = min_freq
        self.tone_error_rate = tone_error_rate
        self.word_replace_rate = word_replace_rate
        self.max_freq_diff = max_freq_diff

        # Word -> frequency table from the jieba dictionary, loaded on first use.
        self._word_freq_cache = None

        self.pinyin_dict = self._create_pinyin_dict()
        self.char_frequency = self._load_or_create_char_frequency()

    @staticmethod
    def _iter_jieba_dict_entries():
        """Yield ``(word, frequency_text)`` for each well-formed line of jieba's ``dict.txt``."""
        dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
        with open(dict_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                # Skip blank or truncated lines instead of failing on them.
                if len(parts) >= 2:
                    yield parts[0], parts[1]

    def _load_word_frequencies(self):
        """Return the word -> frequency table, reading the dictionary only once per instance."""
        cached = getattr(self, "_word_freq_cache", None)
        if cached is None:
            cached = {word: float(freq) for word, freq in self._iter_jieba_dict_entries()}
            self._word_freq_cache = cached
        return cached

    def _load_or_create_char_frequency(self):
        """
        Load the character-frequency table from the JSON cache, or build it
        from the jieba dictionary and write the cache.

        Frequencies are normalised so the most frequent character scores 1000.
        """
        cache_file = Path("depends-data/char_frequency.json")

        # Use the cache when it exists.
        if cache_file.exists():
            with open(cache_file, "r", encoding="utf-8") as f:
                return json.load(f)

        # Otherwise add each word's frequency to every Chinese character in it.
        char_freq = defaultdict(int)
        for word, freq in self._iter_jieba_dict_entries():
            for char in word:
                if self._is_chinese_char(char):
                    char_freq[char] += int(freq)

        max_freq = max(char_freq.values(), default=0)
        normalized_freq = {char: freq / max_freq * 1000 for char, freq in char_freq.items()} if max_freq else {}

        # Make sure the cache directory exists before writing.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(normalized_freq, f, ensure_ascii=False, indent=2)

        return normalized_freq

    @staticmethod
    def _create_pinyin_dict():
        """Build a mapping from tone-numbered pinyin to the characters read that way."""
        chars = [chr(i) for i in range(0x4E00, 0x9FFF)]
        pinyin_dict = defaultdict(list)

        for char in chars:
            try:
                py = pinyin(char, style=Style.TONE3)[0][0]
                pinyin_dict[py].append(char)
            except Exception:
                continue

        return pinyin_dict

    @staticmethod
    def _is_chinese_char(char):
        """Return True if ``char`` lies in the range U+4E00..U+9FFF."""
        try:
            return "\u4e00" <= char <= "\u9fff"
        except Exception as e:
            logger.debug(str(e))
            return False

    def _get_pinyin(self, sentence):
        """Return ``(char, pinyin)`` pairs for the Chinese characters of ``sentence``."""
        result = []
        for char in sentence:
            # Skip whitespace and anything that is not a Chinese character.
            if char.isspace() or not self._is_chinese_char(char):
                continue
            py = pinyin(char, style=Style.TONE3)[0][0]
            result.append((char, py))

        return result

    @staticmethod
    def _get_similar_tone_pinyin(py):
        """
        Return ``py`` with a different tone number.

        Empty input is returned unchanged. Pinyin without a trailing digit gets
        tone 1. A tone outside 1-4 is replaced by a random tone in 1-4.
        """
        if not py or len(py) < 1:
            return py

        # No trailing digit: append tone 1.
        if not py[-1].isdigit():
            return f"{py}1"

        base = py[:-1]
        tone = int(py[-1])

        # Tone outside 1-4: pick any regular tone.
        if tone not in [1, 2, 3, 4]:
            return base + str(random.choice([1, 2, 3, 4]))

        possible_tones = [1, 2, 3, 4]
        possible_tones.remove(tone)
        new_tone = random.choice(possible_tones)
        return base + str(new_tone)

    def _calculate_replacement_probability(self, orig_freq, target_freq):
        """
        Return the probability of accepting a replacement, from the frequency gap.

        A more frequent replacement is always accepted. A drop larger than
        ``max_freq_diff`` is never accepted. In between the probability decays
        exponentially with the size of the drop.
        """
        if target_freq > orig_freq:
            return 1.0

        freq_diff = orig_freq - target_freq
        if freq_diff > self.max_freq_diff:
            return 0.0

        return math.exp(-3 * freq_diff / self.max_freq_diff)

    def _get_similar_frequency_chars(self, char, py, num_candidates=5):
        """
        Return up to ``num_candidates`` homophones of ``char``, most acceptable first.

        With probability ``tone_error_rate`` characters of a wrong-tone pinyin
        are considered too. Returns ``None`` when nothing qualifies.
        """
        homophones = []

        # Use .get() so unknown pinyin keys are not inserted into the defaultdict.
        if random.random() < self.tone_error_rate:
            wrong_tone_py = self._get_similar_tone_pinyin(py)
            homophones.extend(self.pinyin_dict.get(wrong_tone_py, []))

        homophones.extend(self.pinyin_dict.get(py, []))

        if not homophones:
            return None

        orig_freq = self.char_frequency.get(char, 0)

        # Drop the original character and anything below the frequency floor.
        freq_diff = [
            (h, self.char_frequency.get(h, 0))
            for h in homophones
            if h != char and self.char_frequency.get(h, 0) >= self.min_freq
        ]

        if not freq_diff:
            return None

        candidates_with_prob = []
        for h, freq in freq_diff:
            prob = self._calculate_replacement_probability(orig_freq, freq)
            if prob > 0:
                candidates_with_prob.append((h, prob))

        if not candidates_with_prob:
            return None

        candidates_with_prob.sort(key=lambda x: x[1], reverse=True)
        return [candidate for candidate, _ in candidates_with_prob[:num_candidates]]

    @staticmethod
    def _get_word_pinyin(word):
        """Return the tone-numbered pinyin items pypinyin produces for ``word``."""
        return [py[0] for py in pinyin(word, style=Style.TONE3)]

    @staticmethod
    def _segment_sentence(sentence):
        """Split ``sentence`` into words with jieba."""
        return list(jieba.cut(sentence))

    def _get_word_homophones(self, word):
        """
        Return up to five dictionary words that sound the same as ``word``.

        Candidates must be in the jieba dictionary, have at least 10% of the
        original word's frequency, and reach ``min_freq`` on a combined
        word/character frequency score.
        """
        if len(word) == 1:
            return []

        word_pinyin = self._get_word_pinyin(word)

        # Per-syllable candidate characters; give up if any syllable has none.
        candidates = []
        for py in word_pinyin:
            chars = self.pinyin_dict.get(py, [])
            if not chars:
                return []
            candidates.append(chars)

        import itertools

        valid_words = self._load_word_frequencies()

        original_word_freq = valid_words.get(word, 0)
        min_word_freq = original_word_freq * 0.1

        homophones = []
        for combo in itertools.product(*candidates):
            new_word = "".join(combo)
            if new_word != word and new_word in valid_words:
                new_word_freq = valid_words[new_word]
                if new_word_freq >= min_word_freq:
                    # Combined score: 70% word frequency, 30% mean character frequency.
                    char_avg_freq = sum(self.char_frequency.get(c, 0) for c in new_word) / len(new_word)
                    combined_score = new_word_freq * 0.7 + char_avg_freq * 0.3
                    if combined_score >= self.min_freq:
                        homophones.append((new_word, combined_score))

        sorted_homophones = sorted(homophones, key=lambda x: x[1], reverse=True)
        return [candidate for candidate, _ in sorted_homophones[:5]]

    def _char_pinyin_pairs(self, word, word_pinyin):
        """
        Pair every character of ``word`` with its pinyin.

        If the word-level pinyin list has a different length than the word,
        zipping the two would drop characters. In that case each Chinese
        character is looked up on its own and other characters get ``None``.
        """
        if len(word_pinyin) == len(word):
            return list(zip(word, word_pinyin))
        return [
            (char, pinyin(char, style=Style.TONE3)[0][0] if self._is_chinese_char(char) else None)
            for char in word
        ]

    def _pick_typo_char(self, char, py, rate):
        """Return a replacement for ``char`` (attempted with probability ``rate``) or ``None``."""
        if random.random() >= rate:
            return None
        similar_chars = self._get_similar_frequency_chars(char, py)
        if not similar_chars:
            return None
        typo_char = random.choice(similar_chars)
        typo_freq = self.char_frequency.get(typo_char, 0)
        orig_freq = self.char_frequency.get(char, 0)
        replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
        return typo_char if random.random() < replace_prob else None

    def create_typo_sentence(self, sentence):
        """
        Create a copy of ``sentence`` with homophone typos at word and character level.

        Args:
            sentence: The Chinese sentence to process.

        Returns:
            ``(typo_sentence, correction_suggestion)``. With 50% probability the
            suggestion is the correct form of one randomly chosen typo (a word
            typo if any exist, otherwise a character typo); otherwise ``None``.
        """
        result = []
        word_corrections = []  # correct words that were replaced as a whole
        char_corrections = []  # correct characters that were replaced

        for word in self._segment_sentence(sentence):
            # Punctuation, spaces and other non-Chinese tokens pass through.
            if all(not self._is_chinese_char(c) for c in word):
                result.append(word)
                continue

            word_pinyin = self._get_word_pinyin(word)

            # First try to replace the whole word.
            if len(word) > 1 and random.random() < self.word_replace_rate:
                word_homophones = self._get_word_homophones(word)
                if word_homophones:
                    result.append(random.choice(word_homophones))
                    word_corrections.append(word)
                    continue

            # Otherwise try character-level replacement.
            if len(word) == 1:
                typo_char = self._pick_typo_char(word, word_pinyin[0], self.error_rate)
                if typo_char is None:
                    result.append(word)
                else:
                    result.append(typo_char)
                    char_corrections.append(word)
                continue

            # Characters inside a longer word are replaced less often.
            word_error_rate = self.error_rate * (0.7 ** (len(word) - 1))
            word_result = []
            for char, py in self._char_pinyin_pairs(word, word_pinyin):
                typo_char = None if py is None else self._pick_typo_char(char, py, word_error_rate)
                if typo_char is None:
                    word_result.append(char)
                else:
                    word_result.append(typo_char)
                    char_corrections.append(char)
            result.append("".join(word_result))

        # Prefer a word-level correction; fall back to a character-level one.
        correction_suggestion = None
        if random.random() < 0.5:
            if word_corrections:
                correction_suggestion = random.choice(word_corrections)
            elif char_corrections:
                correction_suggestion = random.choice(char_corrections)

        return "".join(result), correction_suggestion

    @staticmethod
    def format_typo_info(typo_info):
        """
        Format typo records as a multi-line report.

        Args:
            typo_info: Iterable of ``(orig, typo, orig_py, typo_py, orig_freq,
                typo_freq)`` tuples.

        Returns:
            One line per record, or a fixed message when the list is empty.
        """
        if not typo_info:
            return "未生成错别字"

        result = []
        for orig, typo, orig_py, typo_py, orig_freq, typo_freq in typo_info:
            # A space in the pinyin means several syllables, i.e. a word replacement.
            is_word = " " in orig_py
            if is_word:
                error_type = "整词替换"
            else:
                tone_error = orig_py[:-1] == typo_py[:-1] and orig_py[-1] != typo_py[-1]
                error_type = "声调错误" if tone_error else "同音字替换"

            result.append(
                f"原文:{orig}({orig_py}) [频率:{orig_freq:.2f}] -> "
                f"替换:{typo}({typo_py}) [频率:{typo_freq:.2f}] [{error_type}]"
            )

        return "\n".join(result)

    def set_params(self, **kwargs):
        """
        Update generator parameters by name.

        Settable parameters: ``error_rate``, ``min_freq``, ``tone_error_rate``,
        ``word_replace_rate``, ``max_freq_diff``. A name that is not an existing
        attribute is reported and ignored.
        """
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)
                print(f"参数 {key} 已设置为 {value}")
            else:
                print(f"警告: 参数 {key} 不存在")
|
||||
|
||||
|
||||
def main():
    """Read a sentence from stdin, print a typo version and how long it took."""
    generator = ChineseTypoGenerator(error_rate=0.03, min_freq=7, tone_error_rate=0.02, word_replace_rate=0.3)

    sentence = input("请输入中文句子:")

    # Timing starts after the input has been read.
    started_at = time.time()
    typo_sentence, suggestion = generator.create_typo_sentence(sentence)

    print("\n原句:", sentence)
    print("错字版:", typo_sentence)

    # A suggestion is only returned some of the time.
    if suggestion:
        print("\n随机纠正建议:")
        print(f"应该改为:{suggestion}")

    elapsed = time.time() - started_at
    print(f"\n总耗时:{elapsed:.2f}秒")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
767
src/chat/utils/utils.py
Normal file
767
src/chat/utils/utils.py
Normal file
@@ -0,0 +1,767 @@
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
import time
|
||||
import jieba
|
||||
import numpy as np
|
||||
|
||||
from collections import Counter
|
||||
from maim_message import UserInfo
|
||||
from typing import Optional, Tuple, Dict, List, Any
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.common.message_repository import find_messages, count_messages
|
||||
from src.config.config import global_config, model_config
|
||||
from src.chat.message_receive.message import MessageRecv
|
||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.person_info.person_info import PersonInfoManager, get_person_info_manager
|
||||
from .typo_generator import ChineseTypoGenerator
|
||||
|
||||
logger = get_logger("chat_utils")
|
||||
|
||||
|
||||
def is_english_letter(char: str) -> bool:
    """Return True if ``char`` is an ASCII English letter, ignoring case.

    Non-ASCII input is rejected before lower-casing. ``str.lower()`` can expand
    some letters (for example "İ" becomes "i" plus a combining dot), and the
    expanded text would otherwise compare inside the "a".."z" range.
    """
    return char.isascii() and "a" <= char.lower() <= "z"
|
||||
|
||||
|
||||
def db_message_to_str(message_dict: dict) -> str:
    """
    Render a stored message record as one ``[time] name: content`` line.

    Args:
        message_dict: Record with a ``time`` timestamp and ``user_id``;
            ``user_nickname``, ``user_cardname`` and ``processed_plain_text``
            are optional.

    Returns:
        The formatted line, terminated by a newline.
    """
    logger.debug(f"message_dict: {message_dict}")
    local_time = time.localtime(message_dict["time"])
    time_str = time.strftime("%m-%d %H:%M:%S", local_time)
    try:
        user_id = message_dict["user_id"]
        nickname = message_dict.get("user_nickname", "")
        cardname = message_dict.get("user_cardname", "")
        name = f"[({user_id}){nickname}]{cardname}"
    except Exception:
        # Fall back to the nickname, or a generic label built from the user id.
        name = message_dict.get("user_nickname", "") or f"用户{message_dict['user_id']}"
    content = message_dict.get("processed_plain_text", "")
    result = f"[{time_str}] {name}: {content}\n"
    logger.debug(f"result: {result}")
    return result
|
||||
|
||||
|
||||
def is_mentioned_bot_in_message(message: MessageRecv) -> tuple[bool, float]:
    """
    Decide whether a message mentions the bot and how likely a reply should be.

    Checks, in order: an explicit ``message.is_mentioned`` value, an
    ``is_mentioned`` entry in the message's additional config, the bot's
    nickname or aliases in the text, an @-mention of the bot account, and a
    reply-to-bot prefix.

    Returns:
        ``(is_mentioned, reply_probability)``.
    """
    keywords = [global_config.bot.nickname]
    nicknames = global_config.bot.alias_names

    # 1. A value already set on the message is returned as-is.
    if message.is_mentioned is not None:
        return bool(message.is_mentioned), message.is_mentioned

    # 2. A value supplied through the additional config is used as the probability.
    extra_config = message.message_info.additional_config
    if extra_config is not None and extra_config.get("is_mentioned") is not None:
        try:
            return True, float(extra_config.get("is_mentioned"))  # type: ignore
        except Exception as e:
            logger.warning(str(e))
            logger.warning(
                f"消息中包含不合理的设置 is_mentioned: {message.message_info.additional_config.get('is_mentioned')}"
            )

    text = message.processed_plain_text
    reply_probability = 0.0

    # 3. Nickname or any alias appearing in the raw text.
    is_mentioned = global_config.bot.nickname in text
    alias_hits = [alias_name for alias_name in global_config.bot.alias_names if alias_name in text]
    if alias_hits:
        is_mentioned = True

    # 4. Explicit @-mention of the bot account.
    is_at = bool(re.search(rf"@<(.+?):{global_config.bot.qq_account}>", text))
    if is_at:
        is_mentioned = True

    if is_at and global_config.chat.at_bot_inevitable_reply:
        reply_probability = 1.0
        logger.debug("被@,回复概率设置为100%")
        return is_mentioned, reply_probability

    if not is_mentioned:
        # 5. Text that starts as a reply to one of the bot's messages.
        replied_to_bot = re.match(
            rf"\[回复 (.+?)\({str(global_config.bot.qq_account)}\):(.+?)\],说:", text
        ) or re.match(
            rf"\[回复<(.+?)(?=:{str(global_config.bot.qq_account)}>)\:{str(global_config.bot.qq_account)}>:(.+?)\],说:",
            text,
        )
        if replied_to_bot:
            is_mentioned = True
        else:
            # 6. Remove @-mentions and reply headers, then search what is left.
            message_content = text
            for pattern in (
                r"@(.+?)((\d+))",
                r"@<(.+?)(?=:(\d+))\:(\d+)>",
                r"\[回复 (.+?)\(((\d+)|未知id)\):(.+?)\],说:",
                r"\[回复<(.+?)(?=:(\d+))\:(\d+)>:(.+?)\],说:",
            ):
                message_content = re.sub(pattern, "", message_content)
            content_hits = [word for word in [*keywords, *nicknames] if word in message_content]
            if content_hits:
                is_mentioned = True

    if is_mentioned and global_config.chat.mentioned_bot_inevitable_reply:
        reply_probability = 1.0
        logger.debug("被提及,回复概率设置为100%")
    return is_mentioned, reply_probability
|
||||
|
||||
|
||||
async def get_embedding(text, request_type="embedding") -> Optional[List[float]]:
    """Request the embedding vector of ``text`` from the configured embedding model.

    Args:
        text: Text to embed.
        request_type: Label passed to ``LLMRequest`` as ``request_type``.

    Returns:
        The embedding returned by ``LLMRequest.get_embedding``, or ``None``
        when the request raised (the error is logged, not propagated).
    """
    client = LLMRequest(model_set=model_config.model_task_config.embedding, request_type=request_type)
    try:
        vector, _ = await client.get_embedding(text)
    except Exception as e:
        logger.error(f"获取embedding失败: {str(e)}")
        return None
    return vector
|
||||
|
||||
|
||||
def get_recent_group_speaker(chat_stream_id: str, sender, limit: int = 12) -> list:
    """List up to five distinct users who spoke recently in a chat stream.

    Args:
        chat_stream_id: Value matched against the ``chat_id`` field of stored messages.
        sender: Compared against ``(platform, user_id)``; matching users are excluded.
        limit: Maximum number of most-recent messages to inspect.

    Returns:
        A list of ``(platform, user_id, user_nickname)`` tuples, newest first,
        without duplicates, without ``sender`` and without the bot account.
        Empty when no messages are found.
    """
    recent_messages = find_messages(
        message_filter={"chat_id": chat_stream_id},
        sort=[("time", -1)],
        limit=limit,
    )
    if not recent_messages:
        return []

    speakers = []
    for row in recent_messages:
        user_info = UserInfo.from_dict(
            {
                "platform": row["user_platform"],
                "user_id": row["user_id"],
                "user_nickname": row["user_nickname"],
                "user_cardname": row.get("user_cardname", ""),
            }
        )
        # Skip the message sender and the bot itself.
        is_other_user = (
            (user_info.platform, user_info.user_id) != sender
            and user_info.user_id != global_config.bot.qq_account
        )
        if not is_other_user:
            continue
        entry = (user_info.platform, user_info.user_id, user_info.user_nickname)
        # Skip duplicates and cap the number of collected speakers at five.
        if entry not in speakers and len(speakers) < 5:
            speakers.append(entry)

    return speakers
|
||||
|
||||
|
||||
def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
    """Split text into sentences at separators, then randomly merge neighbours.

    Steps:
      1. Normalise newlines (collapse blank lines, drop newlines adjacent to a
         separator, turn a newline between two CJK ideographs into "。").
      2. Cut the text at separators (ASCII comma, full-width comma, space,
         "。", ";") into ``(content, separator)`` pairs. A separator whose
         left and right neighbours are both English letters is kept in place.
      3. Merge adjacent pairs with a probability that shrinks as the text
         gets longer.

    The separators themselves are dropped from the output unless two segments
    were merged, in which case the separator between them is kept.

    Note: kaomoji are assumed to have been replaced by placeholders upstream.

    Args:
        text: Text to split.

    Returns:
        The list of non-blank sentences after splitting and merging.
    """
    # Collapse runs of blank lines into a single newline.
    text = re.sub(r"\n\s*\n+", "\n", text)
    # Drop a newline that directly precedes or follows another separator.
    # \uff0c is the full-width comma; the class previously listed the ASCII
    # comma twice, so full-width commas were never treated as separators.
    text = re.sub(r"\n\s*([,\uff0c。;\s])", r"\1", text)
    text = re.sub(r"([,\uff0c。;\s])\s*\n", r"\1", text)
    # A newline between two CJK ideographs becomes a full stop.
    text = re.sub(r"([\u4e00-\u9fff])\n([\u4e00-\u9fff])", r"\1。\2", text)

    len_text = len(text)
    if len_text < 3:
        # Very short text: almost always returned whole, rarely split per character.
        return list(text) if random.random() < 0.01 else [text]

    separators = {",", "\uff0c", " ", "。", ";"}
    segments = []
    current_segment = ""

    # 1. Cut into (content, separator) pairs.
    for i, char in enumerate(text):
        if char not in separators:
            current_segment += char
            continue

        # Do not split when the separator sits between two English letters.
        can_split = True
        if 0 < i < len(text) - 1:
            if is_english_letter(text[i - 1]) and is_english_letter(text[i + 1]):
                can_split = False

        if not can_split:
            current_segment += char
            continue

        if current_segment:
            segments.append((current_segment, char))
        elif char == " ":
            # Empty content but a space separator: keep an empty segment.
            segments.append(("", char))
        current_segment = ""

    # Trailing segment without a following separator.
    if current_segment:
        segments.append((current_segment, ""))

    # Drop pairs where both content and separator are empty.
    segments = [(content, sep) for content, sep in segments if content or sep]

    if not segments:
        # Nothing survived splitting: fall back to the (normalised) text.
        return [text] if text else []

    # 2. Probabilistic merge: longer text is split more aggressively.
    if len_text < 12:
        split_strength = 0.2
    elif len_text < 32:
        split_strength = 0.6
    else:
        split_strength = 0.7
    merge_probability = 1.0 - split_strength

    merged_segments = []
    idx = 0
    while idx < len(segments):
        current_content, current_sep = segments[idx]

        # Merge only if there is a next segment, the dice roll allows it and
        # the current segment has content (evaluation order is significant:
        # random.random() is only drawn when a next segment exists).
        if idx + 1 < len(segments) and random.random() < merge_probability and current_content:
            next_content, next_sep = segments[idx + 1]
            if next_content:
                # (content1 + sep1 + content2, sep2)
                merged_segments.append((current_content + current_sep + next_content, next_sep))
            else:
                # Next content is empty: keep current content, adopt next separator.
                merged_segments.append((current_content, next_sep))
            idx += 2  # the next segment has been consumed
        else:
            merged_segments.append((current_content, current_sep))
            idx += 1

    # Keep only sentences that contain something other than whitespace.
    final_sentences = [content for content, _ in merged_segments if content.strip()]

    logger.debug(f"分割并合并后的句子: {final_sentences}")
    return final_sentences
|
||||
|
||||
|
||||
def random_remove_punctuation(text: str) -> str:
    """Randomly drop or soften punctuation to imitate casual typing.

    A trailing "。" is removed with 90% probability. Each "," is removed with
    5% probability, replaced by a space with 20% probability, and otherwise
    kept. All other characters are copied unchanged.

    Args:
        text: Text to process.

    Returns:
        The processed text.
    """
    kept = []
    last_index = len(text) - 1

    for pos, ch in enumerate(text):
        if ch == "。" and pos == last_index:
            # Final full stop: dropped 90% of the time.
            if random.random() > 0.1:
                continue
        elif ch == ",":
            roll = random.random()
            if roll < 0.05:
                continue  # 5%: drop the comma
            if roll < 0.25:
                kept.append(" ")  # 20%: comma becomes a space
                continue
        kept.append(ch)

    return "".join(kept)
|
||||
|
||||
|
||||
def process_llm_response(text: str, enable_splitter: bool = True, enable_chinese_typo: bool = True) -> list[str]:
    """Post-process an LLM reply into the list of messages to send.

    Pipeline (skipped entirely when response post-processing is disabled in
    the config, in which case ``[text]`` is returned):
      1. Optionally replace kaomoji with placeholders.
      2. Strip bracketed spans matched by the bracket pattern below.
      3. Return a canned reply if nothing is left or a mostly non-western
         text exceeds twice the configured maximum length.
      4. Optionally split into sentences and optionally inject typos.
      5. Return a canned reply if too many messages result.
      6. Restore kaomoji placeholders.

    Args:
        text: Raw reply text.
        enable_splitter: Allow sentence splitting (also requires the config switch).
        enable_chinese_typo: Allow typo generation (also requires the config switch).

    Returns:
        The list of message strings.
    """
    if not global_config.response_post_process.enable_response_post_process:
        return [text]

    # Protect kaomoji first so later steps do not tear them apart.
    kaomoji_mapping = {}
    protected_text = text
    if global_config.response_splitter.enable_kaomoji_protection:
        protected_text, kaomoji_mapping = protect_kaomoji(text)
        logger.debug(f"保护颜文字后的文本: {protected_text}")

    # An opening bracket, a lookahead requiring a CJK character somewhere
    # later on the same line, then the shortest run up to a closing bracket.
    bracket_pattern = re.compile(r"[(\[(](?=.*[一-鿿]).*?[)\])]")
    cleaned_text = bracket_pattern.sub("", protected_text)

    if cleaned_text == "":
        return ["呃呃"]

    logger.debug(f"{text}去除括号处理后的文本: {cleaned_text}")

    max_length = global_config.response_splitter.max_length * 2
    max_sentence_num = global_config.response_splitter.max_sentence_num

    # Length limit only applies to text that is mostly non-western.
    if get_western_ratio(cleaned_text) < 0.1 and len(cleaned_text) > max_length:
        logger.warning(f"回复过长 ({len(cleaned_text)} 字符),返回默认回复")
        return ["懒得说"]

    typo_generator = ChineseTypoGenerator(
        error_rate=global_config.chinese_typo.error_rate,
        min_freq=global_config.chinese_typo.min_freq,
        tone_error_rate=global_config.chinese_typo.tone_error_rate,
        word_replace_rate=global_config.chinese_typo.word_replace_rate,
    )

    split_sentences = (
        split_into_sentences_w_remove_punctuation(cleaned_text)
        if global_config.response_splitter.enable and enable_splitter
        else [cleaned_text]
    )

    sentences = []
    for sentence in split_sentences:
        if not (global_config.chinese_typo.enable and enable_chinese_typo):
            sentences.append(sentence)
            continue
        typoed_text, typo_corrections = typo_generator.create_typo_sentence(sentence)
        sentences.append(typoed_text)
        # A non-empty correction is sent as its own follow-up message.
        if typo_corrections:
            sentences.append(typo_corrections)

    if len(sentences) > max_sentence_num:
        logger.warning(f"分割后消息数量过多 ({len(sentences)} 条),返回默认回复")
        return [f"{global_config.bot.nickname}不知道哦"]

    # Restore kaomoji only after all sentence processing is done.
    if global_config.response_splitter.enable_kaomoji_protection:
        sentences = recover_kaomoji(sentences, kaomoji_mapping)

    return sentences
|
||||
|
||||
|
||||
def calculate_typing_time(
    input_string: str,
    thinking_start_time: float,
    chinese_time: float = 0.3,
    english_time: float = 0.15,
    is_emoji: bool = False,
) -> float:
    """Estimate how long typing ``input_string`` should take, in seconds.

    Characters in the range U+4E00..U+9FFF cost ``chinese_time`` each; every
    other character costs ``english_time``.

    Args:
        input_string: Text being "typed".
        thinking_start_time: Timestamp (``time.time()`` scale) at which thinking started.
        chinese_time: Seconds per CJK character (default 0.3).
        english_time: Seconds per other character (default 0.15).
        is_emoji: Whether the message is an emoji.

    Returns:
        - ``chinese_time * 3 + 0.3`` when the stripped input is a single CJK
          character (this is the only path that adds the 0.3s "enter" time,
          and it takes precedence over the cases below);
        - ``1`` when ``is_emoji`` is true or more than 10 seconds have passed
          since ``thinking_start_time``;
        - otherwise the per-character sum.
    """

    def _is_cjk(ch: str) -> bool:
        return "\u4e00" <= ch <= "\u9fff"

    cjk_count = sum(1 for ch in input_string if _is_cjk(ch))

    # A lone CJK character gets triple time plus the enter key.
    if cjk_count == 1 and len(input_string.strip()) == 1:
        return chinese_time * 3 + 0.3

    # Accumulate sequentially so float rounding stays the same as a plain loop.
    typing_seconds = 0.0
    for ch in input_string:
        typing_seconds += chinese_time if _is_cjk(ch) else english_time

    timed_out = time.time() - thinking_start_time > 10
    if is_emoji or timed_out:
        typing_seconds = 1

    return typing_seconds
|
||||
|
||||
|
||||
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors, or 0 if either has zero norm."""
    dot = np.dot(v1, v2)
    magnitude_a = np.linalg.norm(v1)
    magnitude_b = np.linalg.norm(v2)
    if magnitude_a == 0 or magnitude_b == 0:
        return 0
    return dot / (magnitude_a * magnitude_b)
|
||||
|
||||
|
||||
def text_to_vector(text):
    """Tokenise ``text`` with jieba and return a ``Counter`` of token frequencies."""
    return Counter(jieba.lcut(text))
|
||||
|
||||
|
||||
def find_similar_topics_simple(text: str, topics: list, top_k: int = 5) -> list:
    """Rank topics by cosine similarity of token-frequency vectors.

    Args:
        text: Query text.
        topics: Candidate topic strings.
        top_k: Number of best matches to return.

    Returns:
        Up to ``top_k`` ``(topic, similarity)`` pairs, most similar first.
    """
    query_counts = text_to_vector(text)

    scored = []
    for topic in topics:
        topic_counts = text_to_vector(topic)
        # Shared vocabulary so both count vectors have the same dimensions.
        vocabulary = set(query_counts.keys()) | set(topic_counts.keys())
        query_vec = [query_counts.get(word, 0) for word in vocabulary]
        topic_vec = [topic_counts.get(word, 0) for word in vocabulary]
        scored.append((topic, cosine_similarity(query_vec, topic_vec)))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_k]
|
||||
|
||||
|
||||
def truncate_message(message: str, max_length=20) -> str:
    """Cut ``message`` to ``max_length`` characters, appending "..." when it was longer."""
    if len(message) <= max_length:
        return message
    return f"{message[:max_length]}..."
|
||||
|
||||
|
||||
def protect_kaomoji(sentence):
    """Replace kaomoji in ``sentence`` with numbered placeholders.

    Two shapes are recognised: a bracketed span containing at least one
    character that is not CJK, ASCII alphanumeric or whitespace, and a bare
    run of 2-15 characters from a fixed symbol set.

    Args:
        sentence (str): The original sentence.

    Returns:
        tuple: ``(sentence_with_placeholders, {placeholder: kaomoji})``.
    """
    kaomoji_pattern = re.compile(
        r"("
        r"[(\[(【]"  # opening bracket
        r"[^()\[\]()【】]*?"  # non-bracket characters (lazy)
        r"[^一-龥a-zA-Z0-9\s]"  # at least one char that is not CJK / alnum / whitespace
        r"[^()\[\]()【】]*?"  # non-bracket characters (lazy)
        r"[)\])】"  # closing bracket (character class continues on the next piece)
        r"]"
        r")"
        r"|"
        r"([▼▽・ᴥω・﹏^><≧≦ ̄`´∀ヮДд︿﹀へ。゚╥╯╰︶︹•⁄]{2,15})"
    )

    placeholder_to_kaomoji = {}
    # findall yields one (bracketed, bare) pair per match; exactly one is non-empty.
    for idx, (bracketed, bare) in enumerate(kaomoji_pattern.findall(sentence)):
        kaomoji = bracketed or bare
        placeholder = f"__KAOMOJI_{idx}__"
        placeholder_to_kaomoji[placeholder] = kaomoji
        # Replace only the first remaining occurrence so repeats get their own placeholder.
        sentence = sentence.replace(kaomoji, placeholder, 1)

    return sentence, placeholder_to_kaomoji
|
||||
|
||||
|
||||
def recover_kaomoji(sentences, placeholder_to_kaomoji):
    """Substitute kaomoji placeholders back into each sentence.

    Args:
        sentences (list): Sentences that may contain placeholders.
        placeholder_to_kaomoji (dict): Mapping of placeholder to original kaomoji.

    Returns:
        list: New list of sentences with every placeholder replaced.
    """

    def _restore(sentence):
        for placeholder, kaomoji in placeholder_to_kaomoji.items():
            sentence = sentence.replace(placeholder, kaomoji)
        return sentence

    return [_restore(sentence) for sentence in sentences]
|
||||
|
||||
|
||||
def get_western_ratio(paragraph):
    """Return the share of alphanumeric characters that are English letters.

    Only characters for which ``str.isalnum()`` is true are counted;
    punctuation and whitespace are ignored. Each counted character is
    classified with ``is_english_letter``.

    Args:
        paragraph: Text to inspect.

    Returns:
        float: Ratio in the range 0.0-1.0, or 0.0 when the text has no
        alphanumeric characters.
    """
    total = 0
    western = 0
    for ch in paragraph:
        if not ch.isalnum():
            continue
        total += 1
        if is_english_letter(ch):
            western += 1

    if total == 0:
        return 0.0
    return western / total
|
||||
|
||||
|
||||
def count_messages_between(start_time: float, end_time: float, stream_id: str) -> tuple[int, int]:
    """Count messages in a time window of a chat stream and sum their text lengths.

    Args:
        start_time (float): Window start timestamp (exclusive).
        end_time (float): Window end timestamp (inclusive).
        stream_id (str): Chat stream ID, matched against the ``chat_id`` field.

    Returns:
        tuple[int, int]: ``(message_count, total_text_length)``. ``(0, 0)`` is
        returned for an empty/inverted window, an empty ``stream_id``, or when
        the lookup raises (the error is logged).
    """
    if start_time >= end_time:
        return 0, 0
    if not stream_id:
        logger.error("stream_id 不能为空")
        return 0, 0

    time_window = {"$gt": start_time, "$lte": end_time}
    filter_query = {"chat_id": stream_id, "time": time_window}

    try:
        count = count_messages(filter_query)
        # Fetch the messages themselves to measure their processed text.
        total_length = 0
        for msg in find_messages(message_filter=filter_query):
            total_length += len(msg.get("processed_plain_text", ""))
    except Exception as e:
        logger.error(f"计算消息数量时发生意外错误: {e}")
        return 0, 0

    return count, total_length
|
||||
|
||||
|
||||
def translate_timestamp_to_human_readable(timestamp: float, mode: str = "normal") -> str:
    """Format a timestamp as local time, either absolute or relative to now.

    Args:
        timestamp: Unix timestamp.
        mode: ``"normal"`` gives ``YYYY-MM-DD HH:MM:SS``; ``"relative"`` gives
            a Chinese "time ago" phrase for anything younger than two days and
            the full date followed by ":" for anything older; every other
            value (including ``"normal_no_YMD"`` and ``"lite"``) gives ``HH:MM:SS``.

    Returns:
        str: The formatted time string.
    """
    local = time.localtime(timestamp)

    if mode == "normal":
        return time.strftime("%Y-%m-%d %H:%M:%S", local)

    if mode == "relative":
        elapsed = time.time() - timestamp
        if elapsed < 20:
            return "刚刚"
        if elapsed < 60:
            return f"{int(elapsed)}秒前"
        if elapsed < 3600:
            return f"{int(elapsed / 60)}分钟前"
        if elapsed < 86400:
            return f"{int(elapsed / 3600)}小时前"
        if elapsed < 86400 * 2:
            return f"{int(elapsed / 86400)}天前"
        return time.strftime("%Y-%m-%d %H:%M:%S", local) + ":"

    # "normal_no_YMD", "lite" and unknown modes: time of day only.
    return time.strftime("%H:%M:%S", local)
|
||||
|
||||
|
||||
def get_chat_type_and_target_info(chat_id: str) -> Tuple[bool, Optional[Dict]]:
    """Tell whether a chat stream is a group chat and, if private, who the peer is.

    Args:
        chat_id: Chat stream ID.

    Returns:
        Tuple[bool, Optional[Dict]]:
            - bool: True for a group chat, False for a private chat or when
              the stream is unknown / an error occurred.
            - Optional[Dict]: For a private chat, a dict with keys
              ``platform``, ``user_id``, ``user_nickname``, ``person_id`` and
              ``person_name`` (the last two stay None if the lookup fails);
              otherwise None.
    """
    try:
        stream = get_chat_manager().get_stream(chat_id)
        if not stream:
            logger.warning(f"无法获取 chat_stream for {chat_id} in utils")
            return False, None

        if stream.group_info:
            return True, None

        peer = stream.user_info
        if not peer:
            # Neither group nor user info: treat as unknown.
            return False, None

        platform = stream.platform
        user_id = peer.user_id

        target_info = {
            "platform": platform,
            "user_id": user_id,
            "user_nickname": peer.user_nickname,
            "person_id": None,
            "person_name": None,
        }

        # Person lookup is best-effort; failures only produce a warning.
        try:
            person_id = PersonInfoManager.get_person_id(platform, user_id)
            person_name = None
            if person_id:
                person_name = get_person_info_manager().get_value_sync(person_id, "person_name")
            target_info["person_id"] = person_id
            target_info["person_name"] = person_name
        except Exception as person_e:
            logger.warning(
                f"获取 person_id 或 person_name 时出错 for {platform}:{user_id} in utils: {person_e}"
            )

        return False, target_info
    except Exception as e:
        logger.error(f"获取聊天类型和目标信息时出错 for {chat_id}: {e}", exc_info=True)
        # Fall back to the defaults on any error.
        return False, None
|
||||
|
||||
|
||||
def assign_message_ids(messages: List[Any]) -> List[Dict[str, Any]]:
    """Pair every message with a short ID of the form ``m<position><random digits>``.

    The position is 1-based. The random suffix is one digit (1-9) for lists
    of up to 100 messages and two digits (10-99) for longer lists.

    Args:
        messages: List of messages.

    Returns:
        A list of ``{'id': str, 'message': any}`` dicts in input order.
    """
    low, high = (10, 99) if len(messages) > 100 else (1, 9)
    # Within one call the suffix always has the same number of digits, so the
    # position can be recovered from the ID and two messages can never share
    # an ID; no retry on collision is needed.
    return [
        {"id": f"m{position}{random.randint(low, high)}", "message": message}
        for position, message in enumerate(messages, start=1)
    ]
|
||||
|
||||
|
||||
def assign_message_ids_flexible(
    messages: list,
    prefix: str = "msg",
    id_length: int = 6,
    use_timestamp: bool = False
) -> list:
    """Pair every message with a unique ID built from a prefix, a head and random characters.

    The head is either the 1-based message index or, with ``use_timestamp``,
    the last three digits of the current time in milliseconds. It is followed
    by random lowercase letters/digits so that head plus random part is
    ``id_length`` characters long. At least one random character is always
    appended, so an ID can be longer than ``id_length`` when the head alone
    already fills it.

    Args:
        messages: List of messages.
        prefix: ID prefix, "msg" by default.
        id_length: Target length of the ID excluding the prefix, 6 by default.
        use_timestamp: Use the millisecond-timestamp head instead of the index.

    Returns:
        A list of ``{'id': str, 'message': any}`` dicts in input order.
    """
    alphabet = string.ascii_lowercase + string.digits
    result = []
    used_ids = set()

    for i, message in enumerate(messages):
        index_str = str(i + 1)
        # Retry until the generated ID has not been used in this call.
        while True:
            head = str(int(time.time() * 1000))[-3:] if use_timestamp else index_str
            # Always keep at least one random character. Without it, a
            # timestamp head with id_length <= 3 produced IDs with no random
            # part, so the loop had to wait for the clock to tick and could
            # not finish once all 1000 suffixes were used.
            random_length = max(1, id_length - len(head))
            random_chars = "".join(random.choices(alphabet, k=random_length))
            message_id = f"{prefix}{head}{random_chars}"

            if message_id not in used_ids:
                used_ids.add(message_id)
                break

        result.append({"id": message_id, "message": message})

    return result
|
||||
|
||||
|
||||
# 使用示例:
|
||||
# messages = ["Hello", "World", "Test message"]
|
||||
#
|
||||
# # 基础版本
|
||||
# result1 = assign_message_ids(messages)
|
||||
# # Result (index + one random digit 1-9 for lists of up to 100 messages), e.g.: [{'id': 'm17', 'message': 'Hello'}, {'id': 'm24', 'message': 'World'}, {'id': 'm39', 'message': 'Test message'}]
|
||||
#
|
||||
# # 增强版本 - 自定义前缀和长度
|
||||
# result2 = assign_message_ids_flexible(messages, prefix="chat", id_length=8)
|
||||
# # Result (index + random chars, 8 characters after the prefix), e.g.: [{'id': 'chat1abc2def3', 'message': 'Hello'}, {'id': 'chat2ghi4jkl5', 'message': 'World'}, {'id': 'chat3mno6pqr7', 'message': 'Test message'}]
|
||||
#
|
||||
# # 增强版本 - 使用时间戳
|
||||
# result3 = assign_message_ids_flexible(messages, prefix="ts", use_timestamp=True)
|
||||
# # 结果: [{'id': 'ts123a1b', 'message': 'Hello'}, {'id': 'ts123c2d', 'message': 'World'}, {'id': 'ts123e3f', 'message': 'Test message'}]
|
||||
659
src/chat/utils/utils_image.py
Normal file
659
src/chat/utils/utils_image.py
Normal file
@@ -0,0 +1,659 @@
|
||||
import base64
|
||||
import os
|
||||
import time
|
||||
import hashlib
|
||||
import uuid
|
||||
import io
|
||||
import asyncio
|
||||
import numpy as np
|
||||
|
||||
from typing import Optional, Tuple
|
||||
from PIL import Image
|
||||
from rich.traceback import install
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.common.database.database import db
|
||||
from src.common.database.sqlalchemy_models import Images, ImageDescriptions
|
||||
from src.config.config import global_config, model_config
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.common.database.sqlalchemy_models import get_db_session
|
||||
|
||||
from sqlalchemy import select, and_
|
||||
install(extra_lines=3)
|
||||
|
||||
logger = get_logger("chat_image")
|
||||
|
||||
|
||||
class ImageManager:
|
||||
_instance = None
|
||||
IMAGE_DIR = "data" # 图像存储根目录
|
||||
|
||||
def __new__(cls):
|
||||
if cls._instance is None:
|
||||
cls._instance = super().__new__(cls)
|
||||
cls._instance._initialized = False
|
||||
return cls._instance
|
||||
|
||||
def __init__(self):
    """Run one-time setup of the shared instance.

    Creates the image directory, builds the vision-model request object and
    tries to open the database connection (a connection failure is logged and
    does not abort initialization). Later calls are no-ops.
    """
    if self._initialized:
        return

    self._ensure_image_dir()
    self.vlm = LLMRequest(model_set=model_config.model_task_config.vlm, request_type="image")

    try:
        db.connect(reuse_if_open=True)
        # Tables are created elsewhere via SQLAlchemy (per the original note).
        logger.debug("使用SQLAlchemy进行表管理")
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")

    # Set the flag only after every attribute exists. Setting it before
    # building ``self.vlm`` would leave the singleton permanently marked as
    # initialized without ``vlm`` if that construction raised.
    self._initialized = True
|
||||
|
||||
def _ensure_image_dir(self):
|
||||
"""确保图像存储目录存在"""
|
||||
os.makedirs(self.IMAGE_DIR, exist_ok=True)
|
||||
|
||||
@staticmethod
def _get_description_from_db(image_hash: str, description_type: str) -> Optional[str]:
    """Look up a cached description in the ImageDescriptions table.

    Args:
        image_hash: Hash of the image.
        description_type: Description type ('emoji' or 'image').

    Returns:
        Optional[str]: The stored description, or None when no row matches or
        the query raised (the error is logged).
    """
    try:
        with get_db_session() as session:
            same_hash = ImageDescriptions.image_description_hash == image_hash
            same_type = ImageDescriptions.type == description_type
            query = select(ImageDescriptions).where(and_(same_hash, same_type))
            row = session.execute(query).scalar()
            if row:
                return row.description
            return None
    except Exception as e:
        logger.error(f"从数据库获取描述失败 (SQLAlchemy): {str(e)}")
        return None
|
||||
|
||||
@staticmethod
def _save_description_to_db(image_hash: str, description: str, description_type: str) -> None:
    """Insert or update a description row in the ImageDescriptions table.

    Any error is logged and swallowed.

    Args:
        image_hash: Hash of the image.
        description: Description text to store.
        description_type: Description type ('emoji' or 'image').
    """
    try:
        now = time.time()
        with get_db_session() as session:
            same_hash = ImageDescriptions.image_description_hash == image_hash
            same_type = ImageDescriptions.type == description_type
            row = session.execute(select(ImageDescriptions).where(and_(same_hash, same_type))).scalar()

            if not row:
                # No row for this hash/type yet: create one.
                session.add(
                    ImageDescriptions(
                        image_description_hash=image_hash,
                        type=description_type,
                        description=description,
                        timestamp=now,
                    )
                )
            else:
                # Refresh the existing row in place.
                row.description = description
                row.timestamp = now
            # Per the original note, the commit is performed by the
            # get_db_session() context manager on exit.
    except Exception as e:
        logger.error(f"保存描述到数据库失败 (SQLAlchemy): {str(e)}")
|
||||
|
||||
async def get_emoji_tag(self, image_base64: str) -> str:
    """Build a ``[表情包:...]`` tag from the emotions of a managed emoji.

    The emoji is looked up in the emoji manager by the MD5 of the decoded
    image bytes.

    Args:
        image_base64: Base64-encoded image data.

    Returns:
        The tag string with the emoji's emotions joined by commas.
    """
    # Imported here rather than at module level, as in the original.
    from src.chat.emoji_system.emoji_manager import get_emoji_manager

    manager = get_emoji_manager()
    if isinstance(image_base64, str):
        # Drop any non-ASCII characters before decoding.
        image_base64 = image_base64.encode("ascii", errors="ignore").decode("ascii")
    image_hash = hashlib.md5(base64.b64decode(image_base64)).hexdigest()

    emoji = await manager.get_emoji_from_manager(image_hash)
    tags = ",".join(emoji.emotion)
    return f"[表情包:{tags}]"
|
||||
|
||||
async def get_emoji_description(self, image_base64: str) -> str:
    """Describe an emoji image, using cached descriptions when available.

    Lookup order:
      1. The emoji manager's description for the image hash.
      2. The ImageDescriptions table (type "emoji").
      3. A two-step recognition: the vision model writes a detailed
         description, then a text model condenses it into 1-2 emotion words.

    A freshly generated result is written to disk under ``IMAGE_DIR/emoji``,
    recorded in the Images table (with the detailed description) and cached
    in ImageDescriptions (with the short emotion label).

    Args:
        image_base64: Base64-encoded image data.

    Returns:
        A ``[表情包:...]`` string, a bracketed failure marker, or the emoji
        manager's cached description as-is.
    """
    try:
        # Hash the decoded bytes; strip non-ASCII characters from the base64 first.
        if isinstance(image_base64, str):
            image_base64 = image_base64.encode("ascii", errors="ignore").decode("ascii")
        image_bytes = base64.b64decode(image_base64)
        image_hash = hashlib.md5(image_bytes).hexdigest()
        image_format = Image.open(io.BytesIO(image_bytes)).format.lower()  # type: ignore

        # 1. Prefer the description of an already registered emoji.
        try:
            from src.chat.emoji_system.emoji_manager import get_emoji_manager

            emoji_manager = get_emoji_manager()
            cached_emoji_description = await emoji_manager.get_emoji_description_by_hash(image_hash)
            if cached_emoji_description:
                logger.info(f"[缓存命中] 使用已注册表情包描述: {cached_emoji_description[:50]}...")
                # NOTE(review): returned without the "[表情包:...]" wrapper used by
                # the other paths - confirm the manager already formats it.
                return cached_emoji_description
        except Exception as e:
            logger.debug(f"查询EmojiManager时出错: {e}")

        # 2. Cached label in the ImageDescriptions table.
        if cached_description := self._get_description_from_db(image_hash, "emoji"):
            logger.info(f"[缓存命中] 使用ImageDescriptions表中的描述: {cached_description[:50]}...")
            return f"[表情包:{cached_description}]"

        # 3a. Vision model: detailed description.
        # image_format is already lower-cased, so a single comparison suffices.
        if image_format == "gif":
            image_base64_processed = self.transform_gif(image_base64)
            if image_base64_processed is None:
                logger.warning("GIF转换失败,无法获取描述")
                return "[表情包(GIF处理失败)]"
            vlm_prompt = "这是一个动态图表情包,每一张图代表了动态图的某一帧,黑色背景代表透明,描述一下表情包表达的情感和内容,描述细节,从互联网梗,meme的角度去分析"
            detailed_description, _ = await self.vlm.generate_response_for_image(
                vlm_prompt, image_base64_processed, "jpg", temperature=0.4, max_tokens=300
            )
        else:
            vlm_prompt = (
                "这是一个表情包,请详细描述一下表情包所表达的情感和内容,描述细节,从互联网梗,meme的角度去分析"
            )
            detailed_description, _ = await self.vlm.generate_response_for_image(
                vlm_prompt, image_base64, image_format, temperature=0.4, max_tokens=300
            )

        if detailed_description is None:
            logger.warning("VLM未能生成表情包详细描述")
            return "[表情包(VLM描述生成失败)]"

        # 3b. Text model: condense the description into 1-2 emotion words.
        emotion_prompt = f"""
请你基于这个表情包的详细描述,提取出最核心的情感含义,用1-2个词概括。
详细描述:'{detailed_description}'

要求:
1. 只输出1-2个最核心的情感词汇
2. 从互联网梗、meme的角度理解
3. 输出简短精准,不要解释
4. 如果有多个词用逗号分隔
"""

        # A lower temperature keeps the label stable.
        emotion_llm = LLMRequest(model_set=model_config.model_task_config.utils, request_type="emoji")
        emotion_result, _ = await emotion_llm.generate_response_async(
            emotion_prompt, temperature=0.3, max_tokens=50
        )

        if emotion_result is None:
            logger.warning("LLM未能生成情感标签,使用详细描述的前几个词")
            # Fallback: take the first tokens of the detailed description.
            import jieba

            words = list(jieba.cut(detailed_description))
            emotion_result = ",".join(words[:2]) if len(words) >= 2 else (words[0] if words else "表情")

        # Normalise full-width commas (U+FF0C) to ASCII before splitting; the
        # previous replace(",", ",") was a no-op, so labels separated by a
        # full-width comma were kept as a single tag.
        emotions = [e.strip() for e in emotion_result.replace("\uff0c", ",").split(",") if e.strip()]
        final_emotion = emotions[0] if emotions else "表情"

        # Include a second, different label when there is one.
        if len(emotions) > 1 and emotions[1] != emotions[0]:
            final_emotion = f"{emotions[0]},{emotions[1]}"

        logger.info(f"[emoji识别] 详细描述: {detailed_description[:50]}... -> 情感标签: {final_emotion}")

        # Re-check the cache: an entry may have been stored while the models ran.
        if cached_description := self._get_description_from_db(image_hash, "emoji"):
            logger.warning(f"虽然生成了描述,但是找到缓存表情包描述: {cached_description}")
            return f"[表情包:{cached_description}]"

        # Persist the image file and its metadata for possible later analysis.
        logger.debug(f"保存表情包: {image_hash}")
        current_timestamp = time.time()
        filename = f"{int(current_timestamp)}_{image_hash[:8]}.{image_format}"
        emoji_dir = os.path.join(self.IMAGE_DIR, "emoji")
        os.makedirs(emoji_dir, exist_ok=True)
        file_path = os.path.join(emoji_dir, filename)

        try:
            with open(file_path, "wb") as f:
                f.write(image_bytes)

            # Images table keeps the detailed description (used by a possible
            # registration flow, per the original note).
            try:
                with get_db_session() as session:
                    existing_img = session.execute(select(Images).where(
                        and_(Images.emoji_hash == image_hash, Images.type == "emoji")
                    )).scalar()

                    if existing_img:
                        existing_img.path = file_path
                        existing_img.description = detailed_description
                        existing_img.timestamp = current_timestamp
                    else:
                        new_img = Images(
                            emoji_hash=image_hash,
                            path=file_path,
                            type="emoji",
                            description=detailed_description,
                            timestamp=current_timestamp,
                        )
                        session.add(new_img)
                    # Commit is handled by the session context manager (per the original note).
            except Exception as e:
                logger.error(f"保存到Images表失败: {str(e)}")

        except Exception as e:
            logger.error(f"保存表情包文件或元数据失败: {str(e)}")

        # Cache the short emotion label in the ImageDescriptions table.
        self._save_description_to_db(image_hash, final_emotion, "emoji")

        return f"[表情包:{final_emotion}]"

    except Exception as e:
        logger.error(f"获取表情包描述失败: {str(e)}")
        return "[表情包(处理失败)]"
|
||||
|
||||
async def get_image_description(self, image_base64: str) -> str:
    """Return a bracketed text description for a regular (non-emoji) image.

    Lookup order: the ``description`` stored on the matching ``Images`` row,
    then ``self._get_description_from_db(image_hash, "image")``, and only then
    a new VLM request. A newly generated description is written to disk, to
    the ``Images`` table and to the description cache.

    Args:
        image_base64: Base64-encoded image data.

    Returns:
        ``"[图片:<description>]"`` on success, ``"[图片(描述生成失败)]"`` when the
        VLM returns ``None``, or ``"[图片(处理失败)]"`` when any exception is raised.
    """
    try:
        # Compute the image hash (MD5 of the decoded bytes); non-ASCII characters are dropped first
        if isinstance(image_base64, str):
            image_base64 = image_base64.encode("ascii", errors="ignore").decode("ascii")
        image_bytes = base64.b64decode(image_base64)
        image_hash = hashlib.md5(image_bytes).hexdigest()

        # Prefer a complete description already stored in the Images table
        # NOTE(review): the nesting below is reconstructed from the later uses of `session`
        # (commit/add); confirm that everything up to the final return belongs inside this block.
        with get_db_session() as session:
            existing_image = session.execute(select(Images).where(Images.emoji_hash == image_hash)).scalar()
            if existing_image:
                # Bump the usage counter
                if hasattr(existing_image, "count") and existing_image.count is not None:
                    existing_image.count += 1
                else:
                    existing_image.count = 1

                # A stored description short-circuits everything else
                # NOTE(review): no explicit commit before this return — presumably the session
                # context manager commits on exit; confirm.
                if existing_image.description:
                    logger.debug(f"[缓存命中] 使用Images表中的图片描述: {existing_image.description[:50]}...")
                    return f"[图片:{existing_image.description}]"

            if cached_description := self._get_description_from_db(image_hash, "image"):
                logger.debug(f"[缓存命中] 使用ImageDescriptions表中的描述: {cached_description[:50]}...")
                return f"[图片:{cached_description}]"

            # Ask the VLM for a description
            image_format = Image.open(io.BytesIO(image_bytes)).format.lower()  # type: ignore
            prompt = global_config.custom_prompt.image_prompt
            logger.info(f"[VLM调用] 为图片生成新描述 (Hash: {image_hash[:8]}...)")
            description, _ = await self.vlm.generate_response_for_image(
                prompt, image_base64, image_format, temperature=0.4, max_tokens=300
            )

            if description is None:
                logger.warning("AI未能生成图片描述")
                return "[图片(描述生成失败)]"

            # Persist the image file and its description
            current_timestamp = time.time()
            filename = f"{int(current_timestamp)}_{image_hash[:8]}.{image_format}"
            image_dir = os.path.join(self.IMAGE_DIR, "image")
            os.makedirs(image_dir, exist_ok=True)
            file_path = os.path.join(image_dir, filename)

            try:
                # Write the file
                with open(file_path, "wb") as f:
                    f.write(image_bytes)

                # Write to the database, filling in fields that older rows may lack
                if existing_image:
                    existing_image.path = file_path
                    existing_image.description = description
                    existing_image.timestamp = current_timestamp
                    if not hasattr(existing_image, "image_id") or not existing_image.image_id:
                        existing_image.image_id = str(uuid.uuid4())
                    if not hasattr(existing_image, "vlm_processed") or existing_image.vlm_processed is None:
                        existing_image.vlm_processed = True
                    session.commit()
                    logger.debug(f"[数据库] 更新已有图片记录: {image_hash[:8]}...")
                else:
                    new_img = Images(
                        image_id=str(uuid.uuid4()),
                        emoji_hash=image_hash,
                        path=file_path,
                        type="image",
                        description=description,
                        timestamp=current_timestamp,
                        vlm_processed=True,
                        count=1,
                    )
                    session.add(new_img)
                    session.commit()
                    logger.debug(f"[数据库] 创建新图片记录: {image_hash[:8]}...")
            except Exception as e:
                # Persistence is best-effort: a failure here is logged and the description is still returned
                logger.error(f"保存图片文件或元数据失败: {str(e)}")

            # Also store the description in the ImageDescriptions cache as a fallback
            self._save_description_to_db(image_hash, description, "image")

            logger.info(f"[VLM完成] 图片描述生成: {description[:50]}...")
            return f"[图片:{description}]"
    except Exception as e:
        logger.error(f"获取图片描述失败: {str(e)}")
        return "[图片(处理失败)]"
|
||||
|
||||
@staticmethod
def transform_gif(gif_base64: str, similarity_threshold: float = 1000.0, max_frames: int = 15) -> Optional[str]:
    # sourcery skip: use-contextlib-suppress
    """Convert a GIF into one horizontally stitched still image, skipping similar frames.

    The first frame is always kept. Each later frame is kept only when its mean
    squared error (MSE) against the most recently kept frame exceeds
    ``similarity_threshold``. Kept frames are resized to a height of 200 px
    (aspect ratio preserved) and pasted side by side.

    Args:
        gif_base64: Base64-encoded GIF data.
        similarity_threshold: MSE above which a frame counts as different from
            the last kept frame. A larger value requires a bigger difference,
            so fewer frames are kept. Defaults to 1000.0.
        max_frames: Maximum number of frames to keep (at least the first frame
            is always kept). Defaults to 15.

    Returns:
        Optional[str]: Base64-encoded JPEG of the stitched image, or ``None``
        on failure.
    """
    try:
        # Keep only ASCII characters so stray characters cannot break base64 decoding
        if isinstance(gif_base64, str):
            gif_base64 = gif_base64.encode("ascii", errors="ignore").decode("ascii")
        gif_data = base64.b64decode(gif_base64)

        # Collect every frame as RGB so frames are directly comparable;
        # the context manager releases the decoder once the frames are copied out.
        all_frames = []
        with Image.open(io.BytesIO(gif_data)) as gif:
            try:
                while True:
                    gif.seek(len(all_frames))
                    all_frames.append(gif.convert("RGB"))
            except EOFError:
                pass  # reached the end of the frame sequence

        if not all_frames:
            logger.warning("GIF中没有找到任何帧")
            return None

        # --- frame selection ---
        # Arrays are widened to int32 before subtracting: on the native uint8
        # pixels the difference and its square wrap modulo 256, which makes the
        # MSE meaningless.
        selected_frames = [all_frames[0]]
        last_selected_np = np.array(all_frames[0]).astype(np.int32)
        for candidate in all_frames[1:]:
            if len(selected_frames) >= max_frames:
                break
            candidate_np = np.array(candidate).astype(np.int32)
            diff = candidate_np - last_selected_np
            mse = float(np.mean(diff * diff))
            if mse > similarity_threshold:
                selected_frames.append(candidate)
                last_selected_np = candidate_np
        # --- end of frame selection ---

        # Size of the first kept frame (all frames of one GIF share the canvas size)
        frame_width, frame_height = selected_frames[0].size

        # Scale to a fixed height while preserving the aspect ratio
        target_height = 200
        if frame_height == 0:
            logger.error("帧高度为0,无法计算缩放尺寸")
            return None
        target_width = int((target_height / frame_height) * frame_width)
        if target_width == 0:
            # Extremely tall, narrow frames would otherwise collapse to zero width
            logger.warning(f"计算出的目标宽度为0 (原始尺寸 {frame_width}x{frame_height}),调整为1")
            target_width = 1

        resized_frames = [
            frame.resize((target_width, target_height), Image.Resampling.LANCZOS) for frame in selected_frames
        ]

        # target_width >= 1 and at least one frame is kept, so the canvas width is always positive
        total_width = target_width * len(resized_frames)
        combined_image = Image.new("RGB", (total_width, target_height))

        # Paste the frames left to right
        for idx, frame in enumerate(resized_frames):
            combined_image.paste(frame, (idx * target_width, 0))

        # Encode the result as a base64 JPEG
        buffer = io.BytesIO()
        combined_image.save(buffer, format="JPEG", quality=85)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
    except MemoryError:
        logger.error("GIF转换失败: 内存不足,可能是GIF太大或帧数太多")
        return None
    except Exception as e:
        logger.error(f"GIF转换失败: {str(e)}", exc_info=True)  # keep the traceback for diagnosis
        return None
|
||||
|
||||
async def process_image(self, image_base64: str) -> Tuple[str, str]:
    # sourcery skip: hoist-if-from-if
    """Register an image and return its ID together with a ``[picid:...]`` placeholder.

    A known image (same MD5 hash) has its counter incremented and its existing
    ID returned. A new image is written to ``<IMAGE_DIR>/images``, inserted
    into the ``Images`` table with ``vlm_processed=False``, and described in
    the background by ``_process_image_with_vlm``.

    Args:
        image_base64: Base64-encoded image data.

    Returns:
        Tuple[str, str]: ``(image_id, "[picid:<image_id>]")``, or
        ``("", "[图片]")`` if any exception is raised.
    """
    try:
        # Keep only ASCII characters, then hash the decoded bytes
        if isinstance(image_base64, str):
            image_base64 = image_base64.encode("ascii", errors="ignore").decode("ascii")
        image_bytes = base64.b64decode(image_base64)
        image_hash = hashlib.md5(image_bytes).hexdigest()

        with get_db_session() as session:
            existing_image = session.execute(select(Images).where(Images.emoji_hash == image_hash)).scalar()
            if existing_image:
                # Older rows may lack some fields; back-fill them in place
                if (
                    not hasattr(existing_image, "image_id")
                    or not existing_image.image_id
                    or not hasattr(existing_image, "count")
                    or existing_image.count is None
                    or not hasattr(existing_image, "vlm_processed")
                    or existing_image.vlm_processed is None
                ):
                    logger.debug(f"图片记录缺少必要字段,补全旧记录: {image_hash}")
                    if not existing_image.image_id:
                        existing_image.image_id = str(uuid.uuid4())
                    if existing_image.count is None:
                        existing_image.count = 0
                    if existing_image.vlm_processed is None:
                        existing_image.vlm_processed = False

                existing_image.count += 1
                session.commit()
                return existing_image.image_id, f"[picid:{existing_image.image_id}]"

            image_id = str(uuid.uuid4())

            # Store the new image on disk
            current_timestamp = time.time()
            image_dir = os.path.join(self.IMAGE_DIR, "images")
            os.makedirs(image_dir, exist_ok=True)
            filename = f"{image_id}.png"
            file_path = os.path.join(image_dir, filename)
            with open(file_path, "wb") as f:
                f.write(image_bytes)

            # Record it in the database
            new_img = Images(
                image_id=image_id,
                emoji_hash=image_hash,
                path=file_path,
                type="image",
                timestamp=current_timestamp,
                vlm_processed=False,
                count=1,
            )
            session.add(new_img)
            session.commit()

            # Describe the image in the background. The event loop keeps only a
            # weak reference to tasks, so hold a strong one until the task is
            # done; otherwise it may be garbage-collected mid-execution.
            pending_tasks = getattr(self, "_pending_vlm_tasks", None)
            if pending_tasks is None:
                pending_tasks = self._pending_vlm_tasks = set()
            vlm_task = asyncio.create_task(self._process_image_with_vlm(image_id, image_base64))
            pending_tasks.add(vlm_task)
            vlm_task.add_done_callback(pending_tasks.discard)

            return image_id, f"[picid:{image_id}]"

    except Exception as e:
        logger.error(f"处理图片失败: {str(e)}")
        return "", "[图片]"
|
||||
|
||||
async def _process_image_with_vlm(self, image_id: str, image_base64: str) -> None:
    """Generate (or reuse) a description for a stored image and save it on its record.

    Reuse order: another ``Images`` row with the same hash and a non-empty
    description, then ``self._get_description_from_db(image_hash, "image")``,
    and finally a new VLM request. All exceptions are logged and swallowed.

    Args:
        image_id: ID of the ``Images`` row to update.
        image_base64: Base64-encoded image data.
    """
    try:
        # Keep only ASCII characters, then hash the decoded bytes
        if isinstance(image_base64, str):
            image_base64 = image_base64.encode("ascii", errors="ignore").decode("ascii")
        image_bytes = base64.b64decode(image_base64)
        image_hash = hashlib.md5(image_bytes).hexdigest()

        with get_db_session() as session:
            # Fetch the record this task is supposed to update
            image = session.execute(select(Images).where(Images.image_id == image_id)).scalar()
            if image is None:
                # Without this guard the code below fails on `image.id` and is
                # reported as a generic VLM failure, hiding the real cause.
                logger.warning(f"VLM处理图片失败: 未找到图片记录 (ID: {image_id})")
                return

            # Prefer a description already stored on another row with the same hash
            existing_with_description = session.execute(select(Images).where(
                and_(
                    Images.emoji_hash == image_hash,
                    Images.description.isnot(None),
                    Images.description != "",
                    Images.id != image.id
                )
            )).scalar()
            if existing_with_description:
                logger.debug(f"[缓存复用] 从其他相同图片记录复用描述: {existing_with_description.description[:50]}...")
                image.description = existing_with_description.description
                image.vlm_processed = True
                session.commit()
                # Mirror it into the ImageDescriptions cache as a fallback
                self._save_description_to_db(image_hash, existing_with_description.description, "image")
                return

            # Next, try the ImageDescriptions cache
            if cached_description := self._get_description_from_db(image_hash, "image"):
                logger.debug(f"[缓存复用] 从ImageDescriptions表复用描述: {cached_description[:50]}...")
                image.description = cached_description
                image.vlm_processed = True
                session.commit()
                return

            # Nothing cached: ask the VLM
            image_format = Image.open(io.BytesIO(image_bytes)).format.lower()  # type: ignore
            prompt = global_config.custom_prompt.image_prompt
            logger.info(f"[VLM异步调用] 为图片生成描述 (ID: {image_id}, Hash: {image_hash[:8]}...)")
            description, _ = await self.vlm.generate_response_for_image(
                prompt, image_base64, image_format, temperature=0.4, max_tokens=300
            )

            if description is None:
                logger.warning("VLM未能生成图片描述")
                description = "无法生成描述"

            # Another task may have cached a description while this one awaited the VLM; prefer it
            if cached_description := self._get_description_from_db(image_hash, "image"):
                logger.warning(f"虽然生成了描述,但是找到缓存图片描述: {cached_description}")
                description = cached_description

            # Update the record; commit explicitly like the two reuse paths above
            image.description = description
            image.vlm_processed = True
            session.commit()

            # Mirror the description into the ImageDescriptions cache
            self._save_description_to_db(image_hash, description, "image")

            logger.info(f"[VLM异步完成] 图片描述生成: {description[:50]}...")

    except Exception as e:
        logger.error(f"VLM处理图片失败: {str(e)}")
|
||||
|
||||
|
||||
# 创建全局单例
|
||||
image_manager = None
|
||||
|
||||
|
||||
def get_image_manager() -> ImageManager:
    """Return the process-wide ImageManager, creating it on first use."""
    global image_manager
    if image_manager is not None:
        return image_manager
    # First call: build the singleton and remember it at module level
    image_manager = ImageManager()
    return image_manager
|
||||
|
||||
|
||||
def image_path_to_base64(image_path: str) -> str:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Path of the image file.

    Returns:
        str: Base64-encoded file contents.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        IOError: If the file yields no data when read.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"图片文件不存在: {image_path}")

    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
        # An empty read is treated as a failed read
        if not raw_bytes:
            raise IOError(f"读取图片文件失败: {image_path}")
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode("utf-8")
|
||||
29
src/chat/utils/utils_voice.py
Normal file
29
src/chat/utils/utils_voice.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from src.config.config import global_config, model_config
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from rich.traceback import install
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
logger = get_logger("chat_voice")
|
||||
|
||||
|
||||
async def get_voice_text(voice_base64: str) -> str:
    """Transcribe a base64-encoded voice message into bracketed text.

    Args:
        voice_base64: Base64-encoded audio data.

    Returns:
        ``"[语音:<text>]"`` on success, ``"[语音(文本生成失败)]"`` when the model
        returns ``None``, or ``"[语音]"`` when ASR is disabled or an exception
        is raised.
    """
    asr_enabled = global_config.voice.enable_asr
    if not asr_enabled:
        logger.warning("语音识别未启用,无法处理语音消息")
        return "[语音]"

    try:
        asr_request = LLMRequest(model_set=model_config.model_task_config.voice, request_type="audio")
        transcript = await asr_request.generate_response_for_voice(voice_base64)
        if transcript is not None:
            logger.debug(f"描述是{transcript}")
            return f"[语音:{transcript}]"

        logger.warning("未能生成语音文本")
        return "[语音(文本生成失败)]"
    except Exception as e:
        logger.error(f"语音转文字失败: {str(e)}")
        return "[语音]"
|
||||
60
src/chat/willing/mode_classical.py
Normal file
60
src/chat/willing/mode_classical.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import asyncio
|
||||
|
||||
from src.config.config import global_config
|
||||
from .willing_manager import BaseWillingManager
|
||||
|
||||
|
||||
class ClassicalWillingManager(BaseWillingManager):
    """Willing manager that keeps one decaying willingness value per chat stream."""

    def __init__(self):
        super().__init__()
        # Handle of the background decay loop; None until async_task_starter runs
        self._decay_task: asyncio.Task | None = None

    async def _decay_reply_willing(self):
        """Multiply every chat's willingness by 0.9 once per second, forever."""
        while True:
            await asyncio.sleep(1)
            for stream_id, willing in self.chat_reply_willing.items():
                self.chat_reply_willing[stream_id] = max(0.0, willing * 0.9)

    async def async_task_starter(self):
        """Start the decay loop unless a task handle is already stored."""
        if self._decay_task is not None:
            return
        self._decay_task = asyncio.create_task(self._decay_reply_willing())

    async def get_reply_probability(self, message_id):
        """Update the chat's willingness for this message and return the reply probability."""
        info = self.ongoing_messages[message_id]
        stream_id = info.chat_id

        # Stored willingness (0 for an unseen chat) plus this message's interest
        willing = self.chat_reply_willing.get(stream_id, 0)
        willing += info.interested_rate

        boost_on_mention = (
            info.is_mentioned_bot and global_config.chat.mentioned_bot_inevitable_reply and willing < 2
        )
        if boost_on_mention:
            willing += 1 if willing < 1.0 else 0.2

        # The stored value is capped at 1.0; the probability uses the uncapped value
        self.chat_reply_willing[stream_id] = min(willing, 1.0)

        return min(max((willing - 0.5), 0.01) * 2, 1.5)

    async def before_generate_reply_handle(self, message_id):
        """No-op hook."""
        pass

    async def after_generate_reply_handle(self, message_id):
        """Raise the chat's willingness by 0.3 (capped at 1.0) after a reply was generated."""
        if message_id not in self.ongoing_messages:
            return

        stream_id = self.ongoing_messages[message_id].chat_id
        willing = self.chat_reply_willing.get(stream_id, 0)
        if willing < 1:
            self.chat_reply_willing[stream_id] = min(1.0, willing + 0.3)

    async def not_reply_handle(self, message_id):
        """Delegate to the base-class handler."""
        return await super().not_reply_handle(message_id)
|
||||
23
src/chat/willing/mode_custom.py
Normal file
23
src/chat/willing/mode_custom.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from .willing_manager import BaseWillingManager
|
||||
|
||||
NOT_IMPLEMENTED_MESSAGE = "\ncustom模式你实现了吗?没自行实现不要选custom。给你退了快点给你麦爹配置\n注:以上内容由gemini生成,如有不满请投诉gemini"
|
||||
|
||||
class CustomWillingManager(BaseWillingManager):
    """Placeholder for a user-supplied willing mode; every entry point raises NotImplementedError."""

    def __init__(self):
        # Base state is initialised first, then construction is refused
        super().__init__()
        raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

    async def async_task_starter(self) -> None:
        """Not implemented."""
        raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

    async def get_reply_probability(self, message_id: str):
        """Not implemented."""
        raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

    async def before_generate_reply_handle(self, message_id: str):
        """Not implemented."""
        raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

    async def after_generate_reply_handle(self, message_id: str):
        """Not implemented."""
        raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

    async def not_reply_handle(self, message_id: str):
        """Not implemented."""
        raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)
|
||||
296
src/chat/willing/mode_mxp.py
Normal file
296
src/chat/willing/mode_mxp.py
Normal file
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
Mxp 模式:梦溪畔独家赞助
|
||||
此模式的一些参数不会在配置文件中显示,要修改请在可变参数下修改
|
||||
同时一些全局设置对此模式无效
|
||||
此模式的可变参数暂时比较草率,需要调参仙人的大手
|
||||
此模式的特点:
|
||||
1.每个聊天流的每个用户的意愿是独立的
|
||||
2.接入关系系统,关系会影响意愿值(已移除,因为关系系统重构)
|
||||
3.会根据群聊的热度来调整基础意愿值
|
||||
4.限制同时思考的消息数量,防止喷射
|
||||
5.拥有单聊增益,无论在群里还是私聊,只要bot一直和你聊,就会增加意愿值
|
||||
6.意愿分为衰减意愿+临时意愿
|
||||
7.疲劳机制
|
||||
|
||||
如果你发现本模式出现了bug
|
||||
上上策是询问智慧的小草神()
|
||||
上策是询问万能的千石可乐
|
||||
中策是发issue
|
||||
下下策是询问一个菜鸟(@梦溪畔)
|
||||
"""
|
||||
|
||||
from typing import Dict
|
||||
import asyncio
|
||||
import time
|
||||
import math
|
||||
|
||||
from src.chat.message_receive.chat_stream import ChatStream
|
||||
from .willing_manager import BaseWillingManager
|
||||
|
||||
|
||||
class MxpWillingManager(BaseWillingManager):
    """Mxp willing manager.

    Tracks willingness per (chat, person), adjusts the chat's base willingness
    from recent message frequency, and applies a fatigue penalty after the bot
    has replied several times within a minute.
    """

    def __init__(self):
        super().__init__()
        self.chat_person_reply_willing: Dict[str, Dict[str, float]] = {}  # chat_id: {person_id: willingness}
        self.chat_new_message_time: Dict[str, list[float]] = {}  # chat_id: timestamps of recent messages
        self.last_response_person: Dict[str, tuple[str, int]] = {}  # chat_id: (person_id, counter)
        self.temporary_willing: float = 0  # willingness computed by the latest get_reply_probability
        self.chat_bot_message_time: Dict[str, list[float]] = {}  # chat_id: timestamps of bot replies
        self.chat_fatigue_punishment_list: Dict[
            str, list[tuple[float, float]]
        ] = {}  # chat_id: list of (start_time, duration) fatigue penalties
        self.chat_fatigue_willing_attenuation: Dict[str, float] = {}  # chat_id: current fatigue offset

        # Tunable parameters
        self.intention_decay_rate = 0.93  # per-tick decay of the gap to the base willingness

        self.number_of_message_storage = 12  # number of message timestamps kept per chat
        self.expected_replies_per_min = 3  # expected replies per minute
        self.basic_maximum_willing = 0.5  # maximum base willingness

        self.mention_willing_gain = 0.6  # gain when the bot is mentioned
        self.interest_willing_gain = 0.3  # gain scale for the interest rate
        self.single_chat_gain = 0.12  # gain for consecutive exchanges with one person

        self.fatigue_messages_triggered_num = self.expected_replies_per_min  # replies per minute that trigger fatigue
        self.fatigue_coefficient = 1.0  # fatigue scale

        self.is_debug = False  # verbose debug logging

        # Strong references to the background loops: the event loop only keeps
        # weak references, so unreferenced tasks may be garbage-collected.
        self._background_tasks: list[asyncio.Task] = []

    def _message_window_seconds(self) -> float:
        """Return the time window (seconds) used for message bookkeeping and the initial fatigue penalty."""
        return self.number_of_message_storage * self.basic_maximum_willing / self.expected_replies_per_min * 60

    async def async_task_starter(self) -> None:
        """Start the three background loops once; later calls are no-ops."""
        if self._background_tasks:
            return
        self._background_tasks = [
            asyncio.create_task(self._return_to_basic_willing()),
            asyncio.create_task(self._chat_new_message_to_change_basic_willing()),
            asyncio.create_task(self._fatigue_attenuation()),
        ]

    async def before_generate_reply_handle(self, message_id: str):
        """Record a bot reply and add a fatigue penalty when the per-minute trigger count is reached."""
        current_time = time.time()
        async with self.lock:
            w_info = self.ongoing_messages[message_id]
            # Keep only replies from the last 60 seconds, then add this one
            recent_replies = [
                t for t in self.chat_bot_message_time.get(w_info.chat_id, []) if current_time - t < 60
            ]
            recent_replies.append(current_time)
            self.chat_bot_message_time[w_info.chat_id] = recent_replies
            if len(recent_replies) == int(self.fatigue_messages_triggered_num):
                # The penalty lasts twice the time left until the oldest reply leaves the 60 s window
                time_interval = 60 - (current_time - recent_replies.pop(0))
                self.chat_fatigue_punishment_list[w_info.chat_id].append((current_time, time_interval * 2))

    async def after_generate_reply_handle(self, message_id: str):
        """Update the per-chat ``(person_id, counter)`` record after a reply."""
        async with self.lock:
            w_info = self.ongoing_messages[message_id]
            # (The former relationship-based gain was removed together with the relationship system.)
            person_id, streak = self.last_response_person.get(w_info.chat_id, (w_info.person_id, 0))
            if person_id == w_info.person_id:
                # Same person as recorded: count up, saturating at 3
                if streak < 3:
                    self.last_response_person[w_info.chat_id] = (person_id, streak + 1)
            else:
                self.last_response_person[w_info.chat_id] = (w_info.person_id, 0)

    async def not_reply_handle(self, message_id: str):
        """Adjust the sender's willingness when the bot decided not to reply."""
        async with self.lock:
            w_info = self.ongoing_messages[message_id]
            if w_info.is_mentioned_bot:
                self.chat_person_reply_willing[w_info.chat_id][w_info.person_id] += self.mention_willing_gain / 2.5
            if (
                w_info.chat_id in self.last_response_person
                and self.last_response_person[w_info.chat_id][0] == w_info.person_id
                and self.last_response_person[w_info.chat_id][1]
            ):
                self.chat_person_reply_willing[w_info.chat_id][w_info.person_id] += self.single_chat_gain * (
                    2 * self.last_response_person[w_info.chat_id][1] - 1
                )
            now_chat_new_person = self.last_response_person.get(w_info.chat_id, ("", 0))
            if now_chat_new_person[0] != w_info.person_id:
                self.last_response_person[w_info.chat_id] = (w_info.person_id, 0)

    async def get_reply_probability(self, message_id: str):
        # sourcery skip: merge-duplicate-blocks, remove-redundant-if
        """Compute the reply probability for a message.

        The mention and interest gains are written back to the sender's stored
        willingness; the single-chat gain, fatigue offset and in-flight message
        penalty only affect this call's result.
        """
        async with self.lock:
            w_info = self.ongoing_messages[message_id]
            current_willing = self.chat_person_reply_willing[w_info.chat_id][w_info.person_id]
            if self.is_debug:
                self.logger.debug(f"基础意愿值:{current_willing}")

            if w_info.is_mentioned_bot:
                willing_gain = self.mention_willing_gain / (int(current_willing) + 1)
                current_willing += willing_gain
                if self.is_debug:
                    self.logger.debug(f"提及增益:{willing_gain}")

            if w_info.interested_rate > 0:
                willing_gain = math.atan(w_info.interested_rate / 2) / math.pi * 2 * self.interest_willing_gain
                current_willing += willing_gain
                if self.is_debug:
                    self.logger.debug(f"兴趣增益:{willing_gain}")

            self.chat_person_reply_willing[w_info.chat_id][w_info.person_id] = current_willing

            # Single-chat gain
            if (
                w_info.chat_id in self.last_response_person
                and self.last_response_person[w_info.chat_id][0] == w_info.person_id
                and self.last_response_person[w_info.chat_id][1]
            ):
                current_willing += self.single_chat_gain * (2 * self.last_response_person[w_info.chat_id][1] + 1)
                if self.is_debug:
                    self.logger.debug(
                        f"单聊增益:{self.single_chat_gain * (2 * self.last_response_person[w_info.chat_id][1] + 1)}"
                    )

            current_willing += self.chat_fatigue_willing_attenuation.get(w_info.chat_id, 0)
            if self.is_debug:
                self.logger.debug(f"疲劳衰减:{self.chat_fatigue_willing_attenuation.get(w_info.chat_id, 0)}")

            # Penalise chats that already have several messages being processed
            chat_ongoing_messages = [msg for msg in self.ongoing_messages.values() if msg.chat_id == w_info.chat_id]
            chat_person_ongoing_messages = [msg for msg in chat_ongoing_messages if msg.person_id == w_info.person_id]
            if len(chat_person_ongoing_messages) >= 2:
                current_willing = 0
                if self.is_debug:
                    self.logger.debug("进行中消息惩罚:归0")
            elif len(chat_ongoing_messages) == 2:
                current_willing -= 0.5
                if self.is_debug:
                    self.logger.debug("进行中消息惩罚:-0.5")
            elif len(chat_ongoing_messages) == 3:
                current_willing -= 1.5
                if self.is_debug:
                    self.logger.debug("进行中消息惩罚:-1.5")
            elif len(chat_ongoing_messages) >= 4:
                current_willing = 0
                if self.is_debug:
                    self.logger.debug("进行中消息惩罚:归0")

            probability = self._willing_to_probability(current_willing)

            self.temporary_willing = current_willing

            return probability

    async def _return_to_basic_willing(self):
        """Every 3 seconds, pull each person's willingness toward the chat's base willingness."""
        while True:
            await asyncio.sleep(3)
            async with self.lock:
                for chat_id, person_willing in self.chat_person_reply_willing.items():
                    if chat_id not in self.chat_reply_willing:
                        if person_willing:
                            self.logger.debug(f"聊天流{chat_id}不存在,错误")
                        continue
                    basic_willing = self.chat_reply_willing[chat_id]
                    for person_id, willing in person_willing.items():
                        person_willing[person_id] = (
                            basic_willing + (willing - basic_willing) * self.intention_decay_rate
                        )

    def setup(self, message: dict, chat_stream: ChatStream):
        """Register a message and initialise per-chat / per-person state on first sight."""
        super().setup(message, chat_stream)
        stream_id = chat_stream.stream_id
        base_willing = self.chat_reply_willing.setdefault(stream_id, self.basic_maximum_willing)
        person_willing = self.chat_person_reply_willing.setdefault(stream_id, {})
        person_id = self.ongoing_messages[message.get("message_id", "")].person_id
        # A person seen for the first time starts at the chat's base willingness
        person_willing.setdefault(person_id, base_willing)

        current_time = time.time()
        message_times = self.chat_new_message_time.setdefault(stream_id, [])
        message_times.append(current_time)
        if len(message_times) > self.number_of_message_storage:
            message_times.pop(0)

        if stream_id not in self.chat_fatigue_punishment_list:
            # A new chat starts with one penalty spanning the message window
            self.chat_fatigue_punishment_list[stream_id] = [(current_time, self._message_window_seconds())]
            self.chat_fatigue_willing_attenuation[stream_id] = (
                -2 * self.basic_maximum_willing * self.fatigue_coefficient
            )

    @staticmethod
    def _willing_to_probability(willing: float) -> float:
        """Map a willingness value to a probability in [0, 1]."""
        willing = max(0, willing)
        if willing < 2:
            return math.atan(willing * 2) / math.pi * 2
        elif willing < 2.5:
            return math.atan(willing * 4) / math.pi * 2
        else:
            return 1

    async def _chat_new_message_to_change_basic_willing(self):
        """Periodically recompute each chat's base willingness from its recent message timestamps."""
        update_time = 20
        while True:
            await asyncio.sleep(update_time)
            async with self.lock:
                for chat_id, message_times in self.chat_new_message_time.items():
                    # Drop timestamps that fell out of the message window
                    current_time = time.time()
                    window = self._message_window_seconds()
                    message_times = [msg_time for msg_time in message_times if current_time - msg_time < window]
                    self.chat_new_message_time[chat_id] = message_times

                    if len(message_times) < self.number_of_message_storage:
                        self.chat_reply_willing[chat_id] = self.basic_maximum_willing
                        update_time = 20
                    elif len(message_times) == self.number_of_message_storage:
                        time_interval = current_time - message_times[0]
                        basic_willing = self._basic_willing_calculate(time_interval)
                        self.chat_reply_willing[chat_id] = basic_willing
                        # A lower base willingness shortens the next refresh interval (3-20 s)
                        update_time = 17 * basic_willing / self.basic_maximum_willing + 3
                    else:
                        self.logger.debug(f"聊天流{chat_id}消息时间数量异常,数量:{len(message_times)}")
                        self.chat_reply_willing[chat_id] = 0
                if self.is_debug:
                    self.logger.debug(f"聊天流意愿值更新:{self.chat_reply_willing}")

    def _basic_willing_calculate(self, t: float) -> float:
        """Compute the base willingness from the time span ``t`` covered by the stored messages."""
        return math.tan(t * self.expected_replies_per_min * math.pi / 120 / self.number_of_message_storage) / 2

    async def _fatigue_attenuation(self):
        """Once per second, prune expired fatigue penalties and recompute each chat's fatigue offset."""
        while True:
            await asyncio.sleep(1)
            async with self.lock:
                # Read the clock while holding the lock so penalties appended while
                # this loop was waiting cannot have a start time in the future.
                current_time = time.time()
                for chat_id, fatigue_list in self.chat_fatigue_punishment_list.items():
                    active = [
                        (start_time, duration)
                        for start_time, duration in fatigue_list
                        if duration > 0 and current_time - start_time < duration
                    ]
                    # Store the pruned list; otherwise expired penalties accumulate forever
                    self.chat_fatigue_punishment_list[chat_id] = active
                    attenuation = 0
                    for start_time, duration in active:
                        # Clamp to asin's domain in case the clock moved backwards
                        progress = min(1.0, max(-1.0, 2 * (current_time - start_time) / duration - 1))
                        attenuation += (
                            self.chat_reply_willing[chat_id] * 2 / math.pi * math.asin(progress)
                            - self.chat_reply_willing[chat_id]
                        ) * self.fatigue_coefficient
                    self.chat_fatigue_willing_attenuation[chat_id] = attenuation

    async def get_willing(self, chat_id):
        """Return the willingness computed by the most recent get_reply_probability call (``chat_id`` is unused)."""
        return self.temporary_willing
|
||||
180
src/chat/willing/willing_manager.py
Normal file
180
src/chat/willing/willing_manager.py
Normal file
@@ -0,0 +1,180 @@
|
||||
import importlib
|
||||
import asyncio
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Optional, Any
|
||||
from rich.traceback import install
|
||||
from dataclasses import dataclass
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.config.config import global_config
|
||||
from src.chat.message_receive.chat_stream import ChatStream, GroupInfo
|
||||
from src.person_info.person_info import PersonInfoManager, get_person_info_manager
|
||||
|
||||
install(extra_lines=3)
|
||||
|
||||
"""
|
||||
基类方法概览:
|
||||
以下8个方法是你必须在子类重写的(哪怕什么都不干):
|
||||
async_task_starter 在程序启动时执行,在其中用asyncio.create_task启动你想要执行的异步任务
|
||||
before_generate_reply_handle 确定要回复后,在生成回复前的处理
|
||||
after_generate_reply_handle 确定要回复后,在生成回复后的处理
|
||||
not_reply_handle 确定不回复后的处理
|
||||
get_reply_probability 获取回复概率
|
||||
get_variable_parameters 暂不确定
|
||||
set_variable_parameters 暂不确定
|
||||
以下2个方法根据你的实现可以做调整:
|
||||
get_willing 获取某聊天流意愿
|
||||
set_willing 设置某聊天流意愿
|
||||
规范说明:
|
||||
模块文件命名: `mode_{manager_type}.py`
|
||||
示例: 若 `manager_type="aggressive"`,则模块文件应为 `mode_aggressive.py`
|
||||
类命名: `{manager_type}WillingManager` (首字母大写)
|
||||
示例: 在 `mode_aggressive.py` 中,类名应为 `AggressiveWillingManager`
|
||||
"""
|
||||
|
||||
|
||||
logger = get_logger("willing")
|
||||
|
||||
|
||||
@dataclass
class WillingInfo:
    """Parameters commonly used by the willing module for one message.

    Attributes:
        message (Dict[str, Any]): Raw message data.
        chat (ChatStream): Chat stream object.
        person_info_manager (PersonInfoManager): Person-info manager object.
        chat_id (str): Identifier of the current chat stream.
        person_id (str): Identifier of the sender's person info.
        group_info (Optional[GroupInfo]): Group information (empty for private chats).
        is_mentioned_bot (bool): Whether the bot was mentioned.
        is_emoji (bool): Whether the message is an emoji/sticker.
        is_picid (bool): NOTE(review): presumably whether the message is a
            ``picid`` image reference; confirm against the caller.
        interested_rate (float): Interest rate.
    """

    message: Dict[str, Any]  # raw message data
    chat: ChatStream
    person_info_manager: PersonInfoManager
    chat_id: str
    person_id: str
    group_info: Optional[GroupInfo]
    is_mentioned_bot: bool
    is_emoji: bool
    is_picid: bool
    interested_rate: float
    # current_mood: float  (current mood? not used yet)
|
||||
|
||||
|
||||
class BaseWillingManager(ABC):
    """Abstract base class for reply-willingness managers."""

    @classmethod
    def create(cls, manager_type: str) -> "BaseWillingManager":
        """Build the manager that implements the requested willingness mode.

        The module ``mode_{manager_type}`` is imported relative to this
        package and the class ``{manager_type.capitalize()}WillingManager`` is
        looked up in it. If importing, looking up, validating or constructing
        that class raises ImportError, AttributeError or TypeError, the
        classical manager is created instead.

        Args:
            manager_type: Mode name used to derive the module and class names.

        Returns:
            An instance of the requested manager, or of
            ``ClassicalWillingManager`` when the requested one cannot be used.
        """
        try:
            mode_module = importlib.import_module(f".mode_{manager_type}", __package__)
            manager_class = getattr(mode_module, f"{manager_type.capitalize()}WillingManager")
            if not issubclass(manager_class, cls):
                raise TypeError(f"Manager class {manager_class.__name__} is not a subclass of {cls.__name__}")
            logger.info(f"普通回复模式:{manager_type}")
            # Instantiate inside the try block so that a TypeError raised by
            # the constructor also leads to the classical fallback.
            return manager_class()
        except (ImportError, AttributeError, TypeError) as e:
            fallback_module = importlib.import_module(".mode_classical", __package__)
            fallback_class = fallback_module.ClassicalWillingManager
            logger.info(f"载入当前意愿模式{manager_type}失败,使用经典配方~~~~")
            logger.debug(f"加载willing模式{manager_type}失败,原因: {str(e)}。")
            return fallback_class()

    def __init__(self):
        """Create the state shared by all manager implementations."""
        self.logger = logger
        # Held by get_willing / set_willing while touching chat_reply_willing.
        self.lock = asyncio.Lock()
        # chat_id -> reply willingness of that chat stream.
        self.chat_reply_willing: Dict[str, float] = {}
        # message_id -> context of a message currently being handled.
        self.ongoing_messages: Dict[str, WillingInfo] = {}

    def setup(self, message: dict, chat: ChatStream):
        """Register the context of a message in ``ongoing_messages``.

        Args:
            message: Message data. The keys read are "message_id",
                "is_mentioned", "is_emoji", "is_picid" and "interest_value".
                A missing "message_id" stores the entry under the empty
                string; a missing or falsy "interest_value" becomes 0.0.
            chat: Chat stream that supplies the platform, user, stream id and
                group information.
        """
        sender_person_id = PersonInfoManager.get_person_id(chat.platform, chat.user_info.user_id)  # type: ignore
        info = WillingInfo(
            message=message,
            chat=chat,
            person_info_manager=get_person_info_manager(),
            chat_id=chat.stream_id,
            person_id=sender_person_id,
            group_info=chat.group_info,
            is_mentioned_bot=message.get("is_mentioned", False),
            is_emoji=message.get("is_emoji", False),
            is_picid=message.get("is_picid", False),
            interested_rate=message.get("interest_value") or 0.0,
        )
        self.ongoing_messages[message.get("message_id", "")] = info

    def delete(self, message_id: str):
        """Drop a message from ``ongoing_messages``.

        An unknown id is not an error; it is only reported at debug level.

        Args:
            message_id: Key of the entry to remove.
        """
        removed = self.ongoing_messages.pop(message_id, None)
        if removed:
            return
        logger.debug(f"尝试删除不存在的消息 ID: {message_id},可能已被其他流程处理,喵~")

    @abstractmethod
    async def async_task_starter(self) -> None:
        """Abstract method: starter for asynchronous tasks."""

    @abstractmethod
    async def before_generate_reply_handle(self, message_id: str):
        """Abstract method: handling performed before a reply is generated."""

    @abstractmethod
    async def after_generate_reply_handle(self, message_id: str):
        """Abstract method: handling performed after a reply is generated."""

    @abstractmethod
    async def not_reply_handle(self, message_id: str):
        """Abstract method: handling performed when no reply is sent."""

    @abstractmethod
    async def get_reply_probability(self, message_id: str):
        """Abstract method: return the reply probability for a message."""
        raise NotImplementedError

    async def get_willing(self, chat_id: str):
        """Return the reply willingness of a chat stream (0 if none is stored)."""
        async with self.lock:
            willing = self.chat_reply_willing.get(chat_id, 0)
        return willing

    async def set_willing(self, chat_id: str, willing: float):
        """Store the reply willingness of a chat stream."""
        async with self.lock:
            self.chat_reply_willing.update({chat_id: willing})

    # The two hooks below are currently commented out and therefore not
    # required from subclasses.
    # @abstractmethod
    # async def get_variable_parameters(self) -> Dict[str, str]:
    #     """Abstract method: get the variable parameters."""
    #     pass

    # @abstractmethod
    # async def set_variable_parameters(self, parameters: Dict[str, any]):
    #     """Abstract method: set the variable parameters."""
    #     pass
|
||||
|
||||
|
||||
def init_willing_manager() -> BaseWillingManager:
    """Create the willingness manager selected in the configuration.

    The mode name is read from ``global_config.normal_chat.willing_mode``,
    lower-cased, and handed to ``BaseWillingManager.create``.

    Returns:
        The manager instance produced by ``BaseWillingManager.create``.
    """
    configured_mode = global_config.normal_chat.willing_mode
    return BaseWillingManager.create(configured_mode.lower())
|
||||
|
||||
|
||||
# 全局willing_manager对象
|
||||
willing_manager = None
|
||||
|
||||
|
||||
def get_willing_manager():
    """Return the module-wide willingness manager, creating it on first use.

    The instance is kept in the module-level ``willing_manager`` variable and
    is built with ``init_willing_manager`` only while that variable is None.
    """
    global willing_manager
    if willing_manager is not None:
        return willing_manager
    willing_manager = init_willing_manager()
    return willing_manager
|
||||
Reference in New Issue
Block a user