Merge pull request #473 from Ark-Hakobune/main-fix

颜文字分割问题fix
2025-03-18 17:57:31 +08:00
parent aa982d2738 9ed115dd15
commit 2da2769e44
1 changed files with 55 additions and 1 deletions
--- a/src/plugins/chat/utils.py
+++ b/src/plugins/chat/utils.py
@@ -1,6 +1,7 @@
 import math
 import random
 import time
+import re
 from collections import Counter
 from typing import Dict, List

@@ -253,7 +254,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
    # 统一将英文逗号转换为中文逗号
    text = text.replace(',', '，')
    text = text.replace('\n', ' ')
-
+    text, mapping = protect_kaomoji(text)
    # print(f"处理前的文本: {text}")

    text_no_1 = ''
@@ -292,6 +293,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
                current_sentence += ' ' + part
        new_sentences.append(current_sentence.strip())
    sentences = [s for s in new_sentences if s]  # 移除空字符串
+    sentences = recover_kaomoji(sentences, mapping)

    # print(f"分割后的句子: {sentences}")
    sentences_done = []
@@ -446,3 +448,55 @@ def truncate_message(message: str, max_length=20) -> str:
    if len(message) > max_length:
        return message[:max_length] + "..."
    return message
+
+
+def protect_kaomoji(sentence):
+    """"
+    识别并保护句子中的颜文字（含括号与无括号），将其替换为占位符，
+    并返回替换后的句子和占位符到颜文字的映射表。
+    Args:
+        sentence (str): 输入的原始句子
+    Returns:
+        tuple: (处理后的句子, {占位符: 颜文字})
+    """
+    kaomoji_pattern = re.compile(
+        r'('
+        r'[\(\[（【]'             # 左括号
+        r'[^()\[\]（）【】]*?'   # 非括号字符（惰性匹配）
+        r'[^\u4e00-\u9fa5a-zA-Z0-9\s]'  # 非中文、非英文、非数字、非空格字符（必须包含至少一个）
+        r'[^()\[\]（）【】]*?'   # 非括号字符（惰性匹配）
+        r'[\)\]）】]'             # 右括号
+        r')'
+        r'|'
+        r'('
+        r'[▼▽・ᴥω･﹏^><≧≦￣｀´∀ヮДд︿﹀へ｡ﾟ╥╯╰︶︹•⁄]{2,15}'
+        r')'
+    )
+
+    kaomoji_matches = kaomoji_pattern.findall(sentence)
+    placeholder_to_kaomoji = {}
+
+    for idx, match in enumerate(kaomoji_matches):
+        kaomoji = match[0] if match[0] else match[1]
+        placeholder = f'__KAOMOJI_{idx}__'
+        sentence = sentence.replace(kaomoji, placeholder, 1)
+        placeholder_to_kaomoji[placeholder] = kaomoji
+
+    return sentence, placeholder_to_kaomoji
+
+
+def recover_kaomoji(sentences, placeholder_to_kaomoji):
+    """
+    根据映射表恢复句子中的颜文字。
+    Args:
+        sentences (list): 含有占位符的句子列表
+        placeholder_to_kaomoji (dict): 占位符到颜文字的映射表
+    Returns:
+        list: 恢复颜文字后的句子列表
+    """
+    recovered_sentences = []
+    for sentence in sentences:
+        for placeholder, kaomoji in placeholder_to_kaomoji.items():
+            sentence = sentence.replace(placeholder, kaomoji)
+        recovered_sentences.append(sentence)
+    return recovered_sentences