refactor(nlp): 将jieba替换为rjieba进行中文分词
This commit is contained in:
@@ -123,13 +123,13 @@ def extract_keywords(text: str, max_keywords: int = 10) -> list[str]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import jieba.analyse
|
import rjieba.analyse
|
||||||
|
|
||||||
# 使用TF-IDF提取关键词
|
# 使用TF-IDF提取关键词
|
||||||
keywords = jieba.analyse.extract_tags(text, topK=max_keywords)
|
keywords = rjieba.analyse.extract_tags(text, topK=max_keywords)
|
||||||
return keywords
|
return keywords
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("jieba未安装,无法提取关键词")
|
logger.warning("rjieba未安装,无法提取关键词")
|
||||||
# 简单分词
|
# 简单分词
|
||||||
words = text.split()
|
words = text.split()
|
||||||
return words[:max_keywords]
|
return words[:max_keywords]
|
||||||
|
|||||||
@@ -21,12 +21,12 @@ class Tokenizer:
|
|||||||
|
|
||||||
if use_jieba:
|
if use_jieba:
|
||||||
try:
|
try:
|
||||||
import jieba
|
import rjieba
|
||||||
|
|
||||||
jieba.initialize()
|
# rjieba 会自动初始化,无需手动调用
|
||||||
logger.info("Jieba分词器初始化成功")
|
logger.info("RJieba分词器初始化成功")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("Jieba未安装,将使用字符级分词")
|
logger.warning("RJieba未安装,将使用字符级分词")
|
||||||
self.use_jieba = False
|
self.use_jieba = False
|
||||||
|
|
||||||
def tokenize(self, text: str) -> list[str]:
|
def tokenize(self, text: str) -> list[str]:
|
||||||
@@ -42,14 +42,14 @@ class Tokenizer:
|
|||||||
if not text:
|
if not text:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# 使用jieba分词
|
# 使用rjieba分词
|
||||||
if self.use_jieba:
|
if self.use_jieba:
|
||||||
try:
|
try:
|
||||||
import jieba
|
import rjieba
|
||||||
|
|
||||||
tokens = list(jieba.cut(text))
|
tokens = list(rjieba.cut(text))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Jieba分词失败,使用字符级分词: {e}")
|
logger.warning(f"RJieba分词失败,使用字符级分词: {e}")
|
||||||
tokens = list(text)
|
tokens = list(text)
|
||||||
else:
|
else:
|
||||||
# 简单按字符分词
|
# 简单按字符分词
|
||||||
|
|||||||
Reference in New Issue
Block a user