修复西文字符句子错误分割的bug
This commit is contained in:
@@ -255,10 +255,11 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
||||
if not is_western_paragraph(text):
|
||||
# 当语言为中文时,统一将英文逗号转换为中文逗号
|
||||
text = text.replace(',', ',')
|
||||
text = text.replace('\n', ' ')
|
||||
else:
|
||||
# 用"|seg|"作为分割符分开
|
||||
text = re.sub(r'([.!?]) +', r'\1\|seg\|', text)
|
||||
text = text.replace('\n', ' ')
|
||||
text = text.replace('\n', '\|seg\|')
|
||||
text, mapping = protect_kaomoji(text)
|
||||
# print(f"处理前的文本: {text}")
|
||||
|
||||
@@ -312,6 +313,8 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
||||
sentences_done = []
|
||||
for sentence in sentences:
|
||||
sentence = sentence.rstrip(',,')
|
||||
# 西文字符句子不进行随机合并
|
||||
if not is_western_paragraph(current_sentence):
|
||||
if random.random() < split_strength * 0.5:
|
||||
sentence = sentence.replace(',', '').replace(',', '')
|
||||
elif random.random() < split_strength:
|
||||
|
||||
Reference in New Issue
Block a user