refactor(deps): 将jieba分词库替换为rjieba

2025-10-05 12:08:18 +08:00
parent fee04c0d25
commit 1c9c4884c6
13 changed files with 85 additions and 29 deletions
--- a/src/chat/utils/typo_generator.py
+++ b/src/chat/utils/typo_generator.py
@@ -9,7 +9,7 @@ import time
 from collections import defaultdict
 from pathlib import Path

-import jieba
+import rjieba
 import orjson
 from pypinyin import Style, pinyin

@@ -56,9 +56,9 @@ class ChineseTypoGenerator:

        # 使用内置的词频文件
        char_freq = defaultdict(int)
-        dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
+        dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")

-        # 读取jieba的词典文件
+        # 读取rjieba的词典文件
        with open(dict_path, encoding="utf-8") as f:
            for line in f:
                word, freq = line.strip().split()[:2]
@@ -224,9 +224,9 @@ class ChineseTypoGenerator:
    @staticmethod
    def _segment_sentence(sentence):
        """
-        使用jieba分词，返回词语列表
+        使用rjieba分词，返回词语列表
        """
-        return list(jieba.cut(sentence))
+        return list(rjieba.cut(sentence))

    def _get_word_homophones(self, word):
        """
@@ -251,8 +251,8 @@ class ChineseTypoGenerator:

        all_combinations = itertools.product(*candidates)

-        # 获取jieba词典和词频信息
-        dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
+        # 获取rjieba词典和词频信息
+        dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
        valid_words = {}  # 改用字典存储词语及其频率
        with open(dict_path, encoding="utf-8") as f:
            for line in f:
--- a/src/chat/utils/utils.py
+++ b/src/chat/utils/utils.py
@@ -6,7 +6,7 @@ import time
 from collections import Counter
 from typing import Any

-import jieba
+import rjieba
 import numpy as np
 from maim_message import UserInfo

@@ -440,7 +440,7 @@ def cosine_similarity(v1, v2):
 def text_to_vector(text):
    """将文本转换为词频向量"""
    # 分词
-    words = jieba.lcut(text)
+    words = rjieba.cut(text)
    return Counter(words)


--- a/src/chat/utils/utils_image.py
+++ b/src/chat/utils/utils_image.py
@@ -226,9 +226,9 @@ class ImageManager:
            if emotion_result is None:
                logger.warning("LLM未能生成情感标签，使用详细描述的前几个词")
                # 降级处理：从详细描述中提取关键词
-                import jieba
+                import rjieba

-                words = list(jieba.cut(detailed_description))
+                words = list(rjieba.cut(detailed_description))
                emotion_result = "，".join(words[:2]) if len(words) >= 2 else (words[0] if words else "表情")

            # 处理情感结果，取前1-2个最重要的标签