From 4e73f66dce3e3a84af5b55af9eb4973985a1c170 Mon Sep 17 00:00:00 2001 From: Bakadax Date: Wed, 19 Mar 2025 10:08:38 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E6=96=87=E5=AD=97?= =?UTF-8?q?=E7=AC=A6=E5=8F=A5=E5=AD=90=E9=94=99=E8=AF=AF=E5=88=86=E8=A1=8C?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 58 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 4bbdd85c8..d64a1e59b 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -226,6 +226,13 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li who_chat_in_group.append(ChatStream.from_dict(chat_info)) return who_chat_in_group +def is_western_char(char): + """检测是否为西文字符""" + return len(char.encode('utf-8')) <= 2 + +def is_western_paragraph(paragraph): + """检测是否为西文字符段落""" + return all(is_western_char(char) for char in paragraph if char.isalnum()) def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: """将文本分割成句子,但保持书名号中的内容完整 @@ -251,8 +258,13 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: # print(f"处理前的文本: {text}") - # 统一将英文逗号转换为中文逗号 - text = text.replace(',', ',') + # 检查是否为西文字符段落 + if not is_western_paragraph(text): + # 当语言为中文时,统一将英文逗号转换为中文逗号 + text = text.replace(',', ',') + else: + # 用"|seg|"作为分割符分开 + text = re.sub(r'([.!?]) +', r'\1\|seg\|', text) text = text.replace('\n', ' ') text, mapping = protect_kaomoji(text) # print(f"处理前的文本: {text}") @@ -276,21 +288,29 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: for sentence in sentences: parts = sentence.split(',') current_sentence = parts[0] - for part in parts[1:]: - if random.random() < split_strength: + if not is_western_paragraph(current_sentence): + for part in parts[1:]: + if random.random() < 
split_strength:
+                    new_sentences.append(current_sentence.strip())
+                    current_sentence = part
+                else:
+                    current_sentence += ',' + part
+            # 处理空格分割
+            space_parts = current_sentence.split(' ')
+            current_sentence = space_parts[0]
+            for part in space_parts[1:]:
+                if random.random() < split_strength:
+                    new_sentences.append(current_sentence.strip())
+                    current_sentence = part
+                else:
+                    current_sentence += ' ' + part
+        else:
+            # 处理分割符
+            space_parts = sentence.split('\|seg\|')
+            current_sentence = space_parts[0]
+            for part in space_parts[1:]:
                 new_sentences.append(current_sentence.strip())
                 current_sentence = part
-            else:
-                current_sentence += ',' + part
-        # 处理空格分割
-        space_parts = current_sentence.split(' ')
-        current_sentence = space_parts[0]
-        for part in space_parts[1:]:
-            if random.random() < split_strength:
-                new_sentences.append(current_sentence.strip())
-                current_sentence = part
-            else:
-                current_sentence += ' ' + part
         new_sentences.append(current_sentence.strip())
     sentences = [s for s in new_sentences if s] # 移除空字符串
     sentences = recover_kaomoji(sentences, mapping)
@@ -338,7 +358,11 @@ def random_remove_punctuation(text: str) -> str:
 
 def process_llm_response(text: str) -> List[str]:
     # processed_response = process_text_with_typos(content)
-    if len(text) > 100:
+    # 对西文字符段落的回复长度设置为汉字字符的两倍
+    if len(text) > 100 and not is_western_paragraph(text) :
+        logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复")
+        return ['懒得说']
+    elif len(text) > 200 :
         logger.warning(f"回复过长 ({len(text)} 字符),返回默认回复")
         return ['懒得说']
     # 处理长消息
@@ -499,4 +523,4 @@ def recover_kaomoji(sentences, placeholder_to_kaomoji):
     for placeholder, kaomoji in placeholder_to_kaomoji.items():
         sentence = sentence.replace(placeholder, kaomoji)
     recovered_sentences.append(sentence)
-    return recovered_sentences
\ No newline at end of file
+    return recovered_sentences