修复西文字符句子错误分割的bug
This commit is contained in:
@@ -255,10 +255,11 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
|||||||
if not is_western_paragraph(text):
|
if not is_western_paragraph(text):
|
||||||
# 当语言为中文时,统一将英文逗号转换为中文逗号
|
# 当语言为中文时,统一将英文逗号转换为中文逗号
|
||||||
text = text.replace(',', ',')
|
text = text.replace(',', ',')
|
||||||
|
text = text.replace('\n', ' ')
|
||||||
else:
|
else:
|
||||||
# 用"|seg|"作为分割符分开
|
# 用"|seg|"作为分割符分开
|
||||||
text = re.sub(r'([.!?]) +', r'\1\|seg\|', text)
|
text = re.sub(r'([.!?]) +', r'\1\|seg\|', text)
|
||||||
text = text.replace('\n', ' ')
|
text = text.replace('\n', '\|seg\|')
|
||||||
text, mapping = protect_kaomoji(text)
|
text, mapping = protect_kaomoji(text)
|
||||||
# print(f"处理前的文本: {text}")
|
# print(f"处理前的文本: {text}")
|
||||||
|
|
||||||
@@ -312,6 +313,8 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
|||||||
sentences_done = []
|
sentences_done = []
|
||||||
for sentence in sentences:
|
for sentence in sentences:
|
||||||
sentence = sentence.rstrip(',,')
|
sentence = sentence.rstrip(',,')
|
||||||
|
# 西文字符句子不进行随机合并
|
||||||
|
if not is_western_paragraph(current_sentence):
|
||||||
if random.random() < split_strength * 0.5:
|
if random.random() < split_strength * 0.5:
|
||||||
sentence = sentence.replace(',', '').replace(',', '')
|
sentence = sentence.replace(',', '').replace(',', '')
|
||||||
elif random.random() < split_strength:
|
elif random.random() < split_strength:
|
||||||
|
|||||||
Reference in New Issue
Block a user