From 61007ffc5e2d648a7990689502c055bc68cea6c7 Mon Sep 17 00:00:00 2001 From: dax <88696221+Dax233@users.noreply.github.com> Date: Wed, 19 Mar 2025 10:28:07 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E6=96=87=E5=AD=97?= =?UTF-8?q?=E7=AC=A6=E5=8F=A5=E5=AD=90=E9=94=99=E8=AF=AF=E5=88=86=E5=89=B2?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 652dec4f9..47014d1c1 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -255,10 +255,11 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: if not is_western_paragraph(text): # 当语言为中文时,统一将英文逗号转换为中文逗号 text = text.replace(',', ',') + text = text.replace('\n', ' ') else: # 用"|seg|"作为分割符分开 text = re.sub(r'([.!?]) +', r'\1\|seg\|', text) - text = text.replace('\n', ' ') + text = text.replace('\n', '\|seg\|') text, mapping = protect_kaomoji(text) # print(f"处理前的文本: {text}") @@ -312,10 +313,12 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: sentences_done = [] for sentence in sentences: sentence = sentence.rstrip(',,') - if random.random() < split_strength * 0.5: - sentence = sentence.replace(',', '').replace(',', '') - elif random.random() < split_strength: - sentence = sentence.replace(',', ' ').replace(',', ' ') + # 西文字符句子不进行随机合并 + if not is_western_paragraph(current_sentence): + if random.random() < split_strength * 0.5: + sentence = sentence.replace(',', '').replace(',', '') + elif random.random() < split_strength: + sentence = sentence.replace(',', ' ').replace(',', ' ') sentences_done.append(sentence) logger.info(f"处理后的句子: {sentences_done}")