refactor(deps): replace the jieba segmentation library with rjieba
@@ -19,7 +19,7 @@ from src.chat.memory_system.memory_builder import MemoryBuilder, MemoryExtractio
from src.chat.memory_system.memory_chunk import MemoryChunk
from src.chat.memory_system.memory_fusion import MemoryFusionEngine
from src.chat.memory_system.memory_query_planner import MemoryQueryPlanner
-# Simplified memory sampling mode enum
+# Memory sampling mode enum
class MemorySamplingMode(Enum):
    """Memory sampling modes"""
    HIPPOCAMPUS = "hippocampus"  # Hippocampus mode: scheduled-task sampling
@@ -162,6 +162,7 @@ class MemorySystem:
    async def initialize(self):
        """Asynchronously initialize the memory system"""
        try:
            logger.info("Initializing memory system...")

            # Initialize the LLM model
            fallback_task = getattr(self.llm_model, "model_for_task", None) if self.llm_model else None
@@ -267,8 +268,11 @@ class MemorySystem:
                logger.warning(f"Hippocampus sampler initialization failed: {e}")
                self.hippocampus_sampler = None

            # Unified storage loads its data automatically; no extra loading needed
-            logger.info("✅ Simplified memory system initialization complete")
+            self.status = MemorySystemStatus.READY
+            logger.info("✅ Memory system initialization complete")

        except Exception as e:
            self.status = MemorySystemStatus.ERROR
@@ -546,16 +550,18 @@ class MemorySystem:
        return existing_candidates

    async def process_conversation_memory(self, context: dict[str, Any]) -> dict[str, Any]:
-        """Public entry point for processing conversation memory; supports three sampling modes: hippocampus, immediate, and all"""
+        """Public entry point for processing conversation memory; supports three sampling modes: hippocampus, precision, and adaptive"""
        start_time = time.time()

        try:
            context = dict(context or {})

            # Get the configured sampling mode
-            sampling_mode = getattr(global_config.memory, 'memory_sampling_mode', 'immediate')
+            sampling_mode = getattr(global_config.memory, 'memory_sampling_mode', 'precision')
            current_mode = MemorySamplingMode(sampling_mode)

+            context['__sampling_mode'] = current_mode.value
+            logger.debug(f"Using memory sampling mode: {current_mode.value}")

            # Process memory according to the sampling mode
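The renamed comment, the new docstring, and the new `'precision'` default together imply three mode values. Below is a minimal sketch of the enum this suggests; only `HIPPOCAMPUS` appears verbatim in the diff, so the `PRECISION` and `ADAPTIVE` member names and values are assumptions inferred from the docstring and the `getattr` default:

```python
from enum import Enum

class MemorySamplingMode(Enum):
    """Sketch of the memory sampling modes implied by this commit."""
    HIPPOCAMPUS = "hippocampus"  # from the diff: scheduled-task sampling
    PRECISION = "precision"      # assumed from the new getattr default 'precision'
    ADAPTIVE = "adaptive"        # assumed from the new docstring's "adaptive" mode

# The lookup path in the hunk above: an unset memory_sampling_mode now falls
# back to "precision", and the string round-trips through the enum constructor.
assert MemorySamplingMode("precision") is MemorySamplingMode.PRECISION
```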
@@ -991,7 +997,7 @@ class MemorySystem:
            from src.chat.message_receive.chat_stream import get_chat_manager

            chat_manager = get_chat_manager()
-            chat_stream = await chat_manager.get_stream(stream_id)
+            chat_stream = chat_manager.get_stream(stream_id)

            if not chat_stream or not hasattr(chat_stream, "context_manager"):
                logger.debug(f"No chat stream or context manager found for stream_id={stream_id}")
@@ -1105,7 +1111,7 @@ class MemorySystem:
            from src.chat.message_receive.chat_stream import get_chat_manager

            chat_manager = get_chat_manager()
-            chat_stream = await chat_manager.get_stream(stream_id)
+            chat_stream = chat_manager.get_stream(stream_id)
            if chat_stream and hasattr(chat_stream, "context_manager"):
                history_limit = self._determine_history_limit(context)
                messages = chat_stream.context_manager.get_messages(limit=history_limit, include_unread=True)
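Both hunks drop an `await`, which is only correct if `ChatManager.get_stream` is a plain synchronous method; awaiting its return value would otherwise raise a `TypeError` at runtime. A minimal sketch of the assumed shape (an illustrative stand-in, not the project's actual implementation):

```python
class ChatManager:
    """Illustrative stand-in for the real chat manager."""

    def __init__(self):
        self._streams: dict[str, object] = {}

    def get_stream(self, stream_id: str):
        # Synchronous dictionary lookup: `await chat_manager.get_stream(...)` on
        # this would fail with "object ... can't be used in 'await' expression".
        return self._streams.get(stream_id)
```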
@@ -9,7 +9,7 @@ import time
from collections import defaultdict
from pathlib import Path

-import jieba
+import rjieba
import orjson
from pypinyin import Style, pinyin
@@ -56,9 +56,9 @@ class ChineseTypoGenerator:

        # Use the built-in word-frequency file
        char_freq = defaultdict(int)
-        dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
+        dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")

-        # Read jieba's dictionary file
+        # Read rjieba's dictionary file
        with open(dict_path, encoding="utf-8") as f:
            for line in f:
                word, freq = line.strip().split()[:2]
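One thing worth verifying here: this keeps jieba's trick of reading `dict.txt` out of the package directory, which assumes rjieba also ships a plain-text dictionary next to its module. If it does not (rjieba wraps jieba-rs, which may embed its dictionary in the compiled binary), the `open()` fails at startup. A defensive sketch of that assumption:

```python
import os

import rjieba

# Assumption: rjieba bundles a jieba-style "dict.txt" beside its module. If the
# file is missing, fail loudly here instead of deep inside the frequency loop.
dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
if not os.path.isfile(dict_path):
    raise FileNotFoundError(
        f"rjieba does not bundle dict.txt at {dict_path}; "
        "ship a word-frequency file with the project instead"
    )
```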
@@ -224,9 +224,9 @@ class ChineseTypoGenerator:
    @staticmethod
    def _segment_sentence(sentence):
        """
-        Segment a sentence with jieba and return the list of words
+        Segment a sentence with rjieba and return the list of words
        """
-        return list(jieba.cut(sentence))
+        return list(rjieba.cut(sentence))

    def _get_word_homophones(self, word):
        """
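A quick check of the drop-in behavior this relies on; it assumes `rjieba.cut` accepts a string and yields `str` tokens the way `jieba.cut` does (the sample sentence and segmentation below are illustrative):

```python
import rjieba

# Wrapping in list() preserves the jieba-compatible list-of-words return shape
# even if rjieba.cut already returns a list rather than a generator.
words = list(rjieba.cut("我来到北京清华大学"))
print(words)  # e.g. ['我', '来到', '北京', '清华大学'] — tokens may differ by version
```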
@@ -251,8 +251,8 @@ class ChineseTypoGenerator:

        all_combinations = itertools.product(*candidates)

-        # Get the jieba dictionary and word-frequency info
-        dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
+        # Get the rjieba dictionary and word-frequency info
+        dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
        valid_words = {}  # Use a dict to store words together with their frequencies
        with open(dict_path, encoding="utf-8") as f:
            for line in f:
@@ -6,7 +6,7 @@ import time
from collections import Counter
from typing import Any

-import jieba
+import rjieba
import numpy as np
from maim_message import UserInfo
@@ -440,7 +440,7 @@ def cosine_similarity(v1, v2):
def text_to_vector(text):
    """Convert text into a word-frequency vector"""
    # Tokenize
-    words = jieba.lcut(text)
+    words = rjieba.lcut(text)
    return Counter(words)
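This is the one call in the commit that is not a pure rename: jieba exposes both `cut` and the list-returning shortcut `lcut`, and it is not guaranteed that rjieba provides an `lcut` alias. If it does not, this line raises `AttributeError` at runtime. A hedged version that works either way (the `hasattr` guard encodes that uncertainty):

```python
from collections import Counter

import rjieba

def text_to_vector(text):
    """Convert text into a word-frequency vector."""
    # Tokenize: prefer lcut if rjieba provides it, otherwise materialize cut().
    if hasattr(rjieba, "lcut"):
        words = rjieba.lcut(text)
    else:
        words = list(rjieba.cut(text))
    return Counter(words)
```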
@@ -226,9 +226,9 @@ class ImageManager:
            if emotion_result is None:
                logger.warning("LLM failed to generate emotion tags; using the first few words of the detailed description")
                # Fallback: extract keywords from the detailed description
-                import jieba
+                import rjieba

-                words = list(jieba.cut(detailed_description))
+                words = list(rjieba.cut(detailed_description))
                emotion_result = ",".join(words[:2]) if len(words) >= 2 else (words[0] if words else "表情")

            # Process the emotion result, keeping the top 1-2 most important tags
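For reference, a trace of the fallback expression with an illustrative input (the sample description and resulting tokens are made up):

```python
import rjieba

detailed_description = "一只开心地挥手的柴犬"  # illustrative input only
words = list(rjieba.cut(detailed_description))
# With two or more tokens the fallback joins the first two as the emotion label;
# with exactly one it uses that token; with none it falls back to "表情".
emotion_result = ",".join(words[:2]) if len(words) >= 2 else (words[0] if words else "表情")
```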
@@ -299,7 +299,7 @@ def load_log_config():  # sourcery skip: use-contextlib-suppress
            "peewee",
            "openai",
            "uvicorn",
-            "jieba",
+            "rjieba",
        ],
        "library_log_levels": {"aiohttp": "WARNING"},
    }
@@ -4,7 +4,7 @@ from datetime import datetime
from difflib import SequenceMatcher
from typing import Any

-import jieba
+import rjieba
import orjson
from json_repair import repair_json
from sklearn.feature_extraction.text import TfidfVectorizer
@@ -535,9 +535,9 @@ class RelationshipManager:
        s1 = str(s1)
        s2 = str(s2)

-        # 1. Tokenize with jieba
-        s1_words = " ".join(jieba.cut(s1))
-        s2_words = " ".join(jieba.cut(s2))
+        # 1. Tokenize with rjieba
+        s1_words = " ".join(rjieba.cut(s1))
+        s2_words = " ".join(rjieba.cut(s2))

        # 2. Put both sentences into a single list
        corpus = [s1_words, s2_words]
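The hunk cuts off before the TF-IDF step, but the numbered comments and the `TfidfVectorizer` import make the intent clear. A self-contained sketch of the full pattern; everything after the `corpus` line is an assumption about how the method continues:

```python
import rjieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(s1, s2) -> float:
    # 1. Tokenize with rjieba and rejoin on spaces so the vectorizer can split
    s1_words = " ".join(rjieba.cut(str(s1)))
    s2_words = " ".join(rjieba.cut(str(s2)))
    # 2. Put both sentences into a single list
    corpus = [s1_words, s2_words]
    # 3. Assumed continuation: TF-IDF vectors plus cosine similarity
    tfidf = TfidfVectorizer().fit_transform(corpus)
    return float(cosine_similarity(tfidf[0], tfidf[1])[0][0])
```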
@@ -184,8 +184,8 @@ cython_debug/
# PyPI configuration file
.pypirc

-# jieba
-jieba.cache
+# rjieba
+rjieba.cache

# .vscode
!.vscode/settings.json