修复西文字符句子错误分行的问题

2025-03-19 10:08:38 +08:00
parent b721bd63bb
commit 4e73f66dce
1 changed files with 41 additions and 17 deletions
--- a/src/plugins/chat/utils.py
+++ b/src/plugins/chat/utils.py
@@ -226,6 +226,13 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li
            who_chat_in_group.append(ChatStream.from_dict(chat_info))
    return who_chat_in_group

+def is_western_char(char):
+    """检测是否为西文字符"""
+    return len(char.encode('utf-8')) <= 2
+
+def is_western_paragraph(paragraph):
+    """检测是否为西文字符段落"""
+    return all(is_western_char(char) for char in paragraph if char.isalnum())

 def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
    """将文本分割成句子，但保持书名号中的内容完整
@@ -251,8 +258,13 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:

    # print(f"处理前的文本: {text}")

-    # 统一将英文逗号转换为中文逗号
-    text = text.replace(',', '，')
+    # 检查是否为西文字符段落
+    if not is_western_paragraph(text):
+        # 当语言为中文时，统一将英文逗号转换为中文逗号
+        text = text.replace(',', '，')
+    else:
+        # 用"|seg|"作为分割符分开
+        text = re.sub(r'([.!?]) +', r'\1\|seg\|', text)
    text = text.replace('\n', ' ')
    text, mapping = protect_kaomoji(text)
    # print(f"处理前的文本: {text}")
@@ -276,21 +288,29 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
    for sentence in sentences:
        parts = sentence.split('，')
        current_sentence = parts[0]
-        for part in parts[1:]:
-            if random.random() < split_strength:
+        if  not is_western_paragraph(current_sentence):
+            for part in parts[1:]:
+                if random.random() < split_strength:
+                    new_sentences.append(current_sentence.strip())
+                    current_sentence = part
+                else:
+                    current_sentence += '，' + part
+            # 处理空格分割
+            space_parts = current_sentence.split(' ')
+            current_sentence = space_parts[0]
+            for part in space_parts[1:]:
+                if random.random() < split_strength:
+                    new_sentences.append(current_sentence.strip())
+                    current_sentence = part
+                else:
+                    current_sentence += ' ' + part
+        else:
+            # 处理分割符
+            space_parts = current_sentence.split('\|seg\|')
+            current_sentence = space_parts[0]
+            for part in space_parts[1:]:
                new_sentences.append(current_sentence.strip())
                current_sentence = part
-            else:
-                current_sentence += '，' + part
-        # 处理空格分割
-        space_parts = current_sentence.split(' ')
-        current_sentence = space_parts[0]
-        for part in space_parts[1:]:
-            if random.random() < split_strength:
-                new_sentences.append(current_sentence.strip())
-                current_sentence = part
-            else:
-                current_sentence += ' ' + part
        new_sentences.append(current_sentence.strip())
    sentences = [s for s in new_sentences if s]  # 移除空字符串
    sentences = recover_kaomoji(sentences, mapping)
@@ -338,7 +358,11 @@ def random_remove_punctuation(text: str) -> str:

 def process_llm_response(text: str) -> List[str]:
    # processed_response = process_text_with_typos(content)
-    if len(text) > 100:
+    # 对西文字符段落的回复长度设置为汉字字符的两倍
+    if len(text) > 100 and  not is_western_paragraph(text) :
+        logger.warning(f"回复过长 ({len(text)} 字符)，返回默认回复")
+        return ['懒得说']
+    elif len(text) > 200 :
        logger.warning(f"回复过长 ({len(text)} 字符)，返回默认回复")
        return ['懒得说']
    # 处理长消息