From 4e73f66dce3e3a84af5b55af9eb4973985a1c170 Mon Sep 17 00:00:00 2001 From: Bakadax Date: Wed, 19 Mar 2025 10:08:38 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E6=96=87=E5=AD=97?= =?UTF-8?q?=E7=AC=A6=E5=8F=A5=E5=AD=90=E9=94=99=E8=AF=AF=E5=88=86=E8=A1=8C?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 58 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 4bbdd85c8..d64a1e59b 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -226,6 +226,13 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li who_chat_in_group.append(ChatStream.from_dict(chat_info)) return who_chat_in_group +def is_western_char(char): + """检测是否为西文字符""" + return len(char.encode('utf-8')) <= 2 + +def is_western_paragraph(paragraph): + """检测是否为西文字符段落""" + return all(is_western_char(char) for char in paragraph if char.isalnum()) def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: """将文本分割成句子,但保持书名号中的内容完整 @@ -251,8 +258,13 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: # print(f"处理前的文本: {text}") - # 统一将英文逗号转换为中文逗号 - text = text.replace(',', ',') + # 检查是否为西文字符段落 + if not is_western_paragraph(text): + # 当语言为中文时,统一将英文逗号转换为中文逗号 + text = text.replace(',', ',') + else: + # 用"|seg|"作为分割符分开 + text = re.sub(r'([.!?]) +', r'\1\|seg\|', text) text = text.replace('\n', ' ') text, mapping = protect_kaomoji(text) # print(f"处理前的文本: {text}") @@ -276,21 +288,29 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: for sentence in sentences: parts = sentence.split(',') current_sentence = parts[0] - for part in parts[1:]: - if random.random() < split_strength: + if not is_western_paragraph(current_sentence): + for part in parts[1:]: + if random.random() < 
split_strength:
+                    new_sentences.append(current_sentence.strip())
+                    current_sentence = part
+                else:
+                    current_sentence += ',' + part
+            # 处理空格分割
+            space_parts = current_sentence.split(' ')
+            current_sentence = space_parts[0]
+            for part in space_parts[1:]:
+                if random.random() < split_strength:
+                    new_sentences.append(current_sentence.strip())
+                    current_sentence = part
+                else:
+                    current_sentence += ' ' + part
+        else:
+            # 处理分割符
+            space_parts = sentence.split('\|seg\|')
+            current_sentence = space_parts[0]
+            for part in space_parts[1:]:
                 new_sentences.append(current_sentence.strip())
                 current_sentence = part
-            else:
-                current_sentence += ',' + part
-        # 处理空格分割
-        space_parts = current_sentence.split(' ')
-        current_sentence = space_parts[0]
-        for part in space_parts[1:]:
-            if random.random() < split_strength:
-                new_sentences.append(current_sentence.strip())
-                current_sentence = part
-            else:
-                current_sentence += ' ' + part
         new_sentences.append(current_sentence.strip())
     sentences = [s for s in new_sentences if s] # 移除空字符串
     sentences = recover_kaomoji(sentences, mapping)
@@ -338,7 +358,11 @@ def random_remove_punctuation(text: str) -> str:
 
 def process_llm_response(text: str) -> List[str]:
     # processed_response = process_text_with_typos(content)
-    if len(text) > 100:
+    # 对西文字符段落的回复长度设置为汉字字符的两倍
+    if len(text) > 100 and not is_western_paragraph(text) :
+        logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复")
+        return ['懒得说']
+    elif len(text) > 200 :
         logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复")
         return ['懒得说']
     # 处理长消息
@@ -499,4 +523,4 @@ def recover_kaomoji(sentences, placeholder_to_kaomoji):
     for placeholder, kaomoji in placeholder_to_kaomoji.items():
         sentence = sentence.replace(placeholder, kaomoji)
     recovered_sentences.append(sentence)
-    return recovered_sentences
\ No newline at end of file
+    return recovered_sentences