refactor(deps): 将jieba分词库替换为rjieba

This commit is contained in:
雅诺狐
2025-10-05 12:08:18 +08:00
committed by Windpicker-owo
parent fee04c0d25
commit 1c9c4884c6
13 changed files with 85 additions and 29 deletions

View File

@@ -9,7 +9,7 @@ import time
from collections import defaultdict
from pathlib import Path
import jieba
import rjieba
import orjson
from pypinyin import Style, pinyin
@@ -56,9 +56,9 @@ class ChineseTypoGenerator:
# 使用内置的词频文件
char_freq = defaultdict(int)
dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
# 读取jieba的词典文件
# 读取rjieba的词典文件
with open(dict_path, encoding="utf-8") as f:
for line in f:
word, freq = line.strip().split()[:2]
@@ -224,9 +224,9 @@ class ChineseTypoGenerator:
@staticmethod
def _segment_sentence(sentence):
"""
使用jieba分词，返回词语列表
使用rjieba分词，返回词语列表
"""
return list(jieba.cut(sentence))
return list(rjieba.cut(sentence))
def _get_word_homophones(self, word):
"""
@@ -251,8 +251,8 @@ class ChineseTypoGenerator:
all_combinations = itertools.product(*candidates)
# 获取jieba词典和词频信息
dict_path = os.path.join(os.path.dirname(jieba.__file__), "dict.txt")
# 获取rjieba词典和词频信息
dict_path = os.path.join(os.path.dirname(rjieba.__file__), "dict.txt")
valid_words = {} # 改用字典存储词语及其频率
with open(dict_path, encoding="utf-8") as f:
for line in f:

View File

@@ -6,7 +6,7 @@ import time
from collections import Counter
from typing import Any
import jieba
import rjieba
import numpy as np
from maim_message import UserInfo
@@ -440,7 +440,7 @@ def cosine_similarity(v1, v2):
def text_to_vector(text):
"""将文本转换为词频向量"""
# 分词
words = jieba.lcut(text)
words = rjieba.lcut(text)
return Counter(words)

View File

@@ -226,9 +226,9 @@ class ImageManager:
if emotion_result is None:
logger.warning("LLM未能生成情感标签使用详细描述的前几个词")
# 降级处理:从详细描述中提取关键词
import jieba
import rjieba
words = list(jieba.cut(detailed_description))
words = list(rjieba.cut(detailed_description))
emotion_result = "".join(words[:2]) if len(words) >= 2 else (words[0] if words else "表情")
# 处理情感结果：取前1-2个最重要的标签