From df22ff91cc646dcc0e76ae528a56ea5554d435a0 Mon Sep 17 00:00:00 2001 From: Windpicker-owo <3431391539@qq.com> Date: Fri, 31 Oct 2025 21:41:02 +0800 Subject: [PATCH] =?UTF-8?q?refactor(nlp):=20=E5=B0=86jieba=E6=9B=BF?= =?UTF-8?q?=E6=8D=A2=E4=B8=BArjieba=E8=BF=9B=E8=A1=8C=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E6=A0=87=E8=AE=B0=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/express/express_utils.py | 6 +++--- src/chat/express/expressor_model/tokenizer.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/chat/express/express_utils.py b/src/chat/express/express_utils.py index 0d1baded1..96c175648 100644 --- a/src/chat/express/express_utils.py +++ b/src/chat/express/express_utils.py @@ -123,13 +123,13 @@ def extract_keywords(text: str, max_keywords: int = 10) -> list[str]: return [] try: - import jieba.analyse + import rjieba.analyse # 使用TF-IDF提取关键词 - keywords = jieba.analyse.extract_tags(text, topK=max_keywords) + keywords = rjieba.analyse.extract_tags(text, topK=max_keywords) return keywords except ImportError: - logger.warning("jieba未安装,无法提取关键词") + logger.warning("rjieba未安装,无法提取关键词") # 简单分词 words = text.split() return words[:max_keywords] diff --git a/src/chat/express/expressor_model/tokenizer.py b/src/chat/express/expressor_model/tokenizer.py index b12cdc713..0e942e2dc 100644 --- a/src/chat/express/expressor_model/tokenizer.py +++ b/src/chat/express/expressor_model/tokenizer.py @@ -21,12 +21,12 @@ class Tokenizer: if use_jieba: try: - import jieba + import rjieba - jieba.initialize() - logger.info("Jieba分词器初始化成功") + # rjieba 会自动初始化,无需手动调用 + logger.info("RJieba分词器初始化成功") except ImportError: - logger.warning("Jieba未安装,将使用字符级分词") + logger.warning("RJieba未安装,将使用字符级分词") self.use_jieba = False def tokenize(self, text: str) -> list[str]: @@ -42,14 +42,14 @@ class Tokenizer: if not text: return [] - # 使用jieba分词 + # 使用rjieba分词 if self.use_jieba: try: - import jieba + import rjieba - tokens = list(jieba.cut(text)) + tokens = list(rjieba.cut(text)) except Exception as e: - logger.warning(f"Jieba分词失败,使用字符级分词: {e}") + logger.warning(f"RJieba分词失败,使用字符级分词: {e}") tokens = list(text) else: # 简单按字符分词