refactor(nlp): replace jieba with rjieba for Chinese tokenization

Author: Windpicker-owo
Date:   2025-10-31 21:41:02 +08:00
Parent: e46d9529f5
Commit: df22ff91cc
2 changed files with 11 additions and 11 deletions

File 1 of 2:

@@ -123,13 +123,13 @@ def extract_keywords(text: str, max_keywords: int = 10) -> list[str]:
         return []
     try:
-        import jieba.analyse
+        import rjieba.analyse
         # Extract keywords with TF-IDF
-        keywords = jieba.analyse.extract_tags(text, topK=max_keywords)
+        keywords = rjieba.analyse.extract_tags(text, topK=max_keywords)
         return keywords
     except ImportError:
-        logger.warning("jieba is not installed; cannot extract keywords")
+        logger.warning("rjieba is not installed; cannot extract keywords")
         # Simple fallback tokenization
         words = text.split()
         return words[:max_keywords]
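Note: as far as I can tell, rjieba (the Python binding to jieba-rs) only exposes segmentation functions such as cut and tag, and does not bundle jieba's analyse submodule, so import rjieba.analyse would raise ImportError and this function would always fall through to the except branch; for Chinese text without spaces, text.split() then returns at most one "word". Below is a minimal sketch of a frequency-based alternative built on rjieba.cut; the function name and the stopword list are illustrative, not part of the commit.

from collections import Counter

import rjieba

# Tiny illustrative stopword list; a real one would be loaded from a file.
STOPWORDS = {"的", "了", "和", "是", "在", "有"}

def extract_keywords_freq(text: str, max_keywords: int = 10) -> list[str]:
    # Segment with rjieba, drop single characters and stopwords,
    # then rank the remaining words by raw frequency.
    words = [w for w in rjieba.cut(text) if len(w) > 1 and w not in STOPWORDS]
    return [w for w, _ in Counter(words).most_common(max_keywords)]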

File 2 of 2:

@@ -21,12 +21,12 @@ class Tokenizer:
         if use_jieba:
             try:
-                import jieba
-                jieba.initialize()
-                logger.info("Jieba tokenizer initialized successfully")
+                import rjieba
+                # rjieba initializes automatically; no manual call needed
+                logger.info("RJieba tokenizer initialized successfully")
             except ImportError:
-                logger.warning("Jieba not installed; will use character-level tokenization")
+                logger.warning("RJieba not installed; will use character-level tokenization")
                 self.use_jieba = False

     def tokenize(self, text: str) -> list[str]:
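Dropping jieba.initialize() makes sense here: jieba builds its dictionary lazily on first use, and initialize() forces that eagerly, whereas rjieba loads its dictionary when the module is imported, so there is no equivalent call to make. If you want failures to surface at construction time rather than on the first tokenize call, a throwaway cut can serve as a smoke test. A minimal sketch of the constructor guard after the switch, where the warm-up line is my addition and not part of the commit:

import logging

logger = logging.getLogger(__name__)

class Tokenizer:
    def __init__(self, use_jieba: bool = True) -> None:
        self.use_jieba = use_jieba
        if use_jieba:
            try:
                import rjieba
                rjieba.cut("预热")  # optional warm-up / smoke test, not in the commit
                logger.info("RJieba tokenizer initialized successfully")
            except ImportError:
                logger.warning("RJieba not installed; using character-level tokenization")
                self.use_jieba = False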
@@ -42,14 +42,14 @@ class Tokenizer:
         if not text:
             return []
-        # Tokenize with jieba
+        # Tokenize with rjieba
         if self.use_jieba:
             try:
-                import jieba
-                tokens = list(jieba.cut(text))
+                import rjieba
+                tokens = list(rjieba.cut(text))
             except Exception as e:
-                logger.warning(f"Jieba tokenization failed, using character-level tokenization: {e}")
+                logger.warning(f"RJieba tokenization failed, using character-level tokenization: {e}")
                 tokens = list(text)
         else:
             # Simple character-level tokenization
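For reference, the two code paths differ as sketched below, assuming the Tokenizer above. rjieba.cut returns segmented words, while the list(text) fallback yields one token per character; the printed segmentation is indicative, since the exact split depends on the dictionary.

import rjieba

text = "我爱自然语言处理"
print(rjieba.cut(text))  # e.g. ['我', '爱', '自然语言', '处理']
print(list(text))        # ['我', '爱', '自', '然', '语', '言', '处', '理']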