From 4e73f66dce3e3a84af5b55af9eb4973985a1c170 Mon Sep 17 00:00:00 2001 From: Bakadax Date: Wed, 19 Mar 2025 10:08:38 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E6=96=87?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E5=8F=A5=E5=AD=90=E9=94=99=E8=AF=AF=E5=88=86?= =?UTF-8?q?=E8=A1=8C=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 58 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 4bbdd85c8..d64a1e59b 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -226,6 +226,13 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li who_chat_in_group.append(ChatStream.from_dict(chat_info)) return who_chat_in_group +def is_western_char(char): + """检测是否为西文字符""" + return len(char.encode('utf-8')) <= 2 + +def is_western_paragraph(paragraph): + """检测是否为西文字符段落""" + return all(is_western_char(char) for char in paragraph if char.isalnum()) def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: """将文本分割成句子,但保持书名号中的内容完整 @@ -251,8 +258,13 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: # print(f"处理前的文本: {text}") - # 统一将英文逗号转换为中文逗号 - text = text.replace(',', ',') + # 检查是否为西文字符段落 + if not is_western_paragraph(text): + # 当语言为中文时,统一将英文逗号转换为中文逗号 + text = text.replace(',', ',') + else: + # 用"|seg|"作为分割符分开 + text = re.sub(r'([.!?]) +', r'\1\|seg\|', text) text = text.replace('\n', ' ') text, mapping = protect_kaomoji(text) # print(f"处理前的文本: {text}") @@ -276,21 +288,29 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: for sentence in sentences: parts = sentence.split(',') current_sentence = parts[0] - for part in parts[1:]: - if random.random() < split_strength: + if not is_western_paragraph(current_sentence): + for part in parts[1:]: + if random.random() < split_strength: + new_sentences.append(current_sentence.strip()) + current_sentence = part + else: + current_sentence += ',' + part + # 处理空格分割 + space_parts = current_sentence.split(' ') + current_sentence = space_parts[0] + for part in space_parts[1:]: + if random.random() < split_strength: + new_sentences.append(current_sentence.strip()) + current_sentence = part + else: + current_sentence += ' ' + part + else: + # 处理分割符 + space_parts = current_sentence.split('\|seg\|') + current_sentence = space_parts[0] + for part in space_parts[1:]: new_sentences.append(current_sentence.strip()) current_sentence = part - else: - current_sentence += ',' + part - # 处理空格分割 - space_parts = current_sentence.split(' ') - current_sentence = space_parts[0] - for part in space_parts[1:]: - if random.random() < split_strength: - new_sentences.append(current_sentence.strip()) - current_sentence = part - else: - current_sentence += ' ' + part new_sentences.append(current_sentence.strip()) sentences = [s for s in new_sentences if s] # 移除空字符串 sentences = recover_kaomoji(sentences, mapping) @@ -338,7 +358,11 @@ def random_remove_punctuation(text: str) -> str: def process_llm_response(text: str) -> List[str]: # processed_response = process_text_with_typos(content) - if len(text) > 100: + # 对西文字符段落的回复长度设置为汉字字符的两倍 + if len(text) > 100 and not is_western_paragraph(text) : + logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复") + return ['懒得说'] + elif len(text) > 200 : logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复") return ['懒得说'] # 处理长消息 @@ -499,4 +523,4 @@ def recover_kaomoji(sentences, placeholder_to_kaomoji): for placeholder, kaomoji in placeholder_to_kaomoji.items(): sentence = sentence.replace(placeholder, kaomoji) recovered_sentences.append(sentence) - return recovered_sentences \ No newline at end of file + return recovered_sentences From 50d22399e08f5b586432aea7fb0d9c7a891a5918 Mon Sep 17 00:00:00 2001 From: dax <88696221+Dax233@users.noreply.github.com> Date: Wed, 19 Mar 2025 10:15:12 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E6=96=87?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E9=94=99=E8=AF=AF=E5=88=86=E8=A1=8C=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index d64a1e59b..652dec4f9 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -226,13 +226,6 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li who_chat_in_group.append(ChatStream.from_dict(chat_info)) return who_chat_in_group -def is_western_char(char): - """检测是否为西文字符""" - return len(char.encode('utf-8')) <= 2 - -def is_western_paragraph(paragraph): - """检测是否为西文字符段落""" - return all(is_western_char(char) for char in paragraph if char.isalnum()) def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: """将文本分割成句子,但保持书名号中的内容完整 @@ -524,3 +517,11 @@ def recover_kaomoji(sentences, placeholder_to_kaomoji): sentence = sentence.replace(placeholder, kaomoji) recovered_sentences.append(sentence) return recovered_sentences + +def is_western_char(char): + """检测是否为西文字符""" + return len(char.encode('utf-8')) <= 2 + +def is_western_paragraph(paragraph): + """检测是否为西文字符段落""" + return all(is_western_char(char) for char in paragraph if char.isalnum()) From 61007ffc5e2d648a7990689502c055bc68cea6c7 Mon Sep 17 00:00:00 2001 From: dax <88696221+Dax233@users.noreply.github.com> Date: Wed, 19 Mar 2025 10:28:07 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E6=96=87?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E5=8F=A5=E5=AD=90=E9=94=99=E8=AF=AF=E5=88=86?= =?UTF-8?q?=E5=89=B2=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 652dec4f9..47014d1c1 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -255,10 +255,11 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: if not is_western_paragraph(text): # 当语言为中文时,统一将英文逗号转换为中文逗号 text = text.replace(',', ',') + text = text.replace('\n', ' ') else: # 用"|seg|"作为分割符分开 text = re.sub(r'([.!?]) +', r'\1\|seg\|', text) - text = text.replace('\n', ' ') + text = text.replace('\n', '\|seg\|') text, mapping = protect_kaomoji(text) # print(f"处理前的文本: {text}") @@ -312,10 +313,12 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: sentences_done = [] for sentence in sentences: sentence = sentence.rstrip(',,') - if random.random() < split_strength * 0.5: - sentence = sentence.replace(',', '').replace(',', '') - elif random.random() < split_strength: - sentence = sentence.replace(',', ' ').replace(',', ' ') + # 西文字符句子不进行随机合并 + if not is_western_paragraph(current_sentence): + if random.random() < split_strength * 0.5: + sentence = sentence.replace(',', '').replace(',', '') + elif random.random() < split_strength: + sentence = sentence.replace(',', ' ').replace(',', ' ') sentences_done.append(sentence) logger.info(f"处理后的句子: {sentences_done}") From 65c26af25b4a436fe131fab9c0147ac46bf1616d Mon Sep 17 00:00:00 2001 From: dax <88696221+Dax233@users.noreply.github.com> Date: Wed, 19 Mar 2025 18:54:02 +0800 Subject: [PATCH 4/4] modified: src/plugins/chat/utils.py --- src/plugins/chat/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 47014d1c1..8f2f006f7 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -355,7 +355,7 @@ def random_remove_punctuation(text: str) -> str: def process_llm_response(text: str) -> List[str]: # processed_response = process_text_with_typos(content) # 对西文字符段落的回复长度设置为汉字字符的两倍 - if len(text) > 100 and not is_western_paragraph(text) : + if len(text) > 100 and not is_western_paragraph(text) : logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复") return ['懒得说'] elif len(text) > 200 :