修复西文字符句子错误分割的bug

This commit is contained in:
dax
2025-03-19 10:28:07 +08:00
parent 50d22399e0
commit 61007ffc5e

View File

@@ -255,10 +255,11 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
if not is_western_paragraph(text): if not is_western_paragraph(text):
# 当语言为中文时,统一将英文逗号转换为中文逗号 # 当语言为中文时,统一将英文逗号转换为中文逗号
text = text.replace(',', ',') text = text.replace(',', ',')
text = text.replace('\n', ' ')
else: else:
# 用"|seg|"作为分割符分开 # 用"|seg|"作为分割符分开
text = re.sub(r'([.!?]) +', r'\1\|seg\|', text) text = re.sub(r'([.!?]) +', r'\1\|seg\|', text)
text = text.replace('\n', ' ') text = text.replace('\n', '\|seg\|')
text, mapping = protect_kaomoji(text) text, mapping = protect_kaomoji(text)
# print(f"处理前的文本: {text}") # print(f"处理前的文本: {text}")
@@ -312,10 +313,12 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
sentences_done = [] sentences_done = []
for sentence in sentences: for sentence in sentences:
sentence = sentence.rstrip(',') sentence = sentence.rstrip(',')
if random.random() < split_strength * 0.5: # 西文字符句子不进行随机合并
sentence = sentence.replace(',', '').replace(',', '') if not is_western_paragraph(current_sentence):
elif random.random() < split_strength: if random.random() < split_strength * 0.5:
sentence = sentence.replace(',', ' ').replace(',', ' ') sentence = sentence.replace(',', '').replace(',', '')
elif random.random() < split_strength:
sentence = sentence.replace(',', ' ').replace(',', ' ')
sentences_done.append(sentence) sentences_done.append(sentence)
logger.info(f"处理后的句子: {sentences_done}") logger.info(f"处理后的句子: {sentences_done}")