refactor(deps): 将jieba分词库替换为rjieba

This commit is contained in:
雅诺狐
2025-10-05 12:08:18 +08:00
committed by Windpicker-owo
parent fee04c0d25
commit 1c9c4884c6
13 changed files with 85 additions and 29 deletions

View File

@@ -19,7 +19,7 @@ from src.chat.memory_system.memory_builder import MemoryBuilder, MemoryExtractio
from src.chat.memory_system.memory_chunk import MemoryChunk
from src.chat.memory_system.memory_fusion import MemoryFusionEngine
from src.chat.memory_system.memory_query_planner import MemoryQueryPlanner
# 简化的记忆采样模式枚举
# 记忆采样模式枚举
class MemorySamplingMode(Enum):
"""记忆采样模式"""
HIPPOCAMPUS = "hippocampus" # 海马体模式:定时任务采样
@@ -162,6 +162,7 @@ class MemorySystem:
async def initialize(self):
"""异步初始化记忆系统"""
try:
logger.info("正在初始化记忆系统...")
# 初始化LLM模型
fallback_task = getattr(self.llm_model, "model_for_task", None) if self.llm_model else None
@@ -267,8 +268,11 @@ class MemorySystem:
logger.warning(f"海马体采样器初始化失败: {e}")
self.hippocampus_sampler = None
# 统一存储已经自动加载数据,无需额外加载
logger.info("✅ 简化版记忆系统初始化完成")
self.status = MemorySystemStatus.READY
logger.info("✅ 记忆系统初始化完成")
except Exception as e:
self.status = MemorySystemStatus.ERROR
@@ -546,16 +550,18 @@ class MemorySystem:
return existing_candidates
async def process_conversation_memory(self, context: dict[str, Any]) -> dict[str, Any]:
"""对外暴露的对话记忆处理接口,支持海马体、即时、所有三种采样模式"""
"""对外暴露的对话记忆处理接口,支持海马体、精准记忆、自适应三种采样模式"""
start_time = time.time()
try:
context = dict(context or {})
# 获取配置的采样模式
sampling_mode = getattr(global_config.memory, 'memory_sampling_mode', 'immediate')
sampling_mode = getattr(global_config.memory, 'memory_sampling_mode', 'precision')
current_mode = MemorySamplingMode(sampling_mode)
context['__sampling_mode'] = current_mode.value
logger.debug(f"使用记忆采样模式: {current_mode.value}")
# 根据采样模式处理记忆
@@ -991,7 +997,7 @@ class MemorySystem:
from src.chat.message_receive.chat_stream import get_chat_manager
chat_manager = get_chat_manager()
chat_stream = await chat_manager.get_stream(stream_id)
chat_stream = chat_manager.get_stream(stream_id)
if not chat_stream or not hasattr(chat_stream, "context_manager"):
logger.debug(f"未找到stream_id={stream_id}的聊天流或上下文管理器")
@@ -1105,7 +1111,7 @@ class MemorySystem:
from src.chat.message_receive.chat_stream import get_chat_manager
chat_manager = get_chat_manager()
chat_stream = await chat_manager.get_stream(stream_id)
chat_stream = chat_manager.get_stream(stream_id)
if chat_stream and hasattr(chat_stream, "context_manager"):
history_limit = self._determine_history_limit(context)
messages = chat_stream.context_manager.get_messages(limit=history_limit, include_unread=True)

View File

@@ -9,7 +9,7 @@ import time
from collections import defaultdict
from pathlib import Path
import jieba
import rjieba
import orjson
from pypinyin import Style, pinyin
@@ -56,9 +56,9 @@ class ChineseTypoGenerator:
# 使用内置的词频文件
char_freq = defaultdict(int)
dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
# 读取jieba的词典文件
# 读取rjieba的词典文件
with open(dict_path, encoding="utf-8") as f:
for line in f:
word, freq = line.strip().split()[:2]
@@ -224,9 +224,9 @@ class ChineseTypoGenerator:
@staticmethod
def _segment_sentence(sentence):
    """Segment a sentence into a list of words using rjieba.

    Args:
        sentence: The text to segment.

    Returns:
        A list containing the tokens produced by ``rjieba.cut`` for
        ``sentence``, in order.
    """
    # Only the rjieba call is kept: the earlier jieba-based return made the
    # rjieba path unreachable and referenced a module that is no longer imported.
    return list(rjieba.cut(sentence))
def _get_word_homophones(self, word):
"""
@@ -251,8 +251,8 @@ class ChineseTypoGenerator:
all_combinations = itertools.product(*candidates)
# 获取jieba词典和词频信息
dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
# 获取rjieba词典和词频信息
dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
valid_words = {} # 改用字典存储词语及其频率
with open(dict_path, encoding="utf-8") as f:
for line in f:

View File

@@ -6,7 +6,7 @@ import time
from collections import Counter
from typing import Any
import jieba
import rjieba
import numpy as np
from maim_message import UserInfo
@@ -440,7 +440,7 @@ def cosine_similarity(v1, v2):
def text_to_vector(text):
    """Convert text into a term-frequency vector.

    Args:
        text: The string to segment into words.

    Returns:
        A ``Counter`` mapping each token produced by ``rjieba.cut`` to the
        number of times it occurs in ``text``.
    """
    # Segment once, with rjieba only. ``cut`` is used instead of jieba's
    # ``lcut`` shortcut, and list() makes the result a list whether ``cut``
    # returns a list or an iterator.
    # NOTE(review): rjieba's README documents cut/cut_all/cut_for_search/tag/
    # tokenize but no ``lcut`` -- confirm against the installed version.
    words = list(rjieba.cut(text))
    return Counter(words)

View File

@@ -226,9 +226,9 @@ class ImageManager:
if emotion_result is None:
logger.warning("LLM未能生成情感标签使用详细描述的前几个词")
# 降级处理:从详细描述中提取关键词
import jieba
import rjieba
words = list(jieba.cut(detailed_description))
words = list(rjieba.cut(detailed_description))
emotion_result = "".join(words[:2]) if len(words) >= 2 else (words[0] if words else "表情")
# 处理情感结果取前1-2个最重要的标签