# Matches a kaomoji (Japanese-style emoticon) in one of two shapes:
#   1. bracketed, e.g. "(^_^)" or "[>_<]": an opening bracket, a body with no
#      nested brackets that contains at least one character that is not
#      CJK, ASCII alphanumeric or whitespace, and a closing bracket;
#   2. bare: a run of 2-15 characters taken from a fixed set of symbols that
#      are typical for emoticons, e.g. "><".
# Full-width parentheses and the full-width macron are written as escapes so
# they cannot be silently folded into ASCII look-alikes (or into a space) by
# tooling that normalises character width.
_KAOMOJI_PATTERN = re.compile(
    r'('
    r'[(\[【\uff08]'  # opening bracket
    r'[^()\[\]【】\uff08\uff09]*?'  # non-bracket characters (lazy)
    r'[^\u4e00-\u9fa5a-zA-Z0-9\s]'  # at least one non-CJK, non-alphanumeric, non-space character
    r'[^()\[\]【】\uff08\uff09]*?'  # non-bracket characters (lazy)
    r'[)\]】\uff09]'  # closing bracket
    r')'
    r'|'
    r'('
    r'[▼▽・ᴥω﹏^><≧≦\uffe3\u0304`´∀ヮДд︿﹀へ。゚╥╯╰︶︹•⁄]{2,15}'
    r')'
)


def protect_kaomoji(sentence: str):
    """Replace every kaomoji in ``sentence`` with a unique placeholder.

    Both bracketed and bare kaomoji (see ``_KAOMOJI_PATTERN``) are swapped for
    placeholders so that later punctuation-based processing cannot break them
    apart. Use ``recover_kaomoji`` with the returned mapping to restore them.

    Args:
        sentence: The original input text.

    Returns:
        A ``(protected_sentence, placeholder_to_kaomoji)`` tuple, where the
        mapping goes from each placeholder to the exact text it replaced.
        Placeholders are numbered in order of appearance and normally look
        like ``__KAOMOJI_0__``, ``__KAOMOJI_1__``, ...
    """
    # If the input already contains placeholder-looking text, lengthen the
    # prefix until it is unique; otherwise recovery would overwrite the
    # user's literal text with a kaomoji.
    prefix = '__KAOMOJI_'
    while prefix in sentence:
        prefix += '_'

    placeholder_to_kaomoji = {}

    def _stash(match):
        # Number placeholders by how many kaomoji have been stored so far.
        placeholder = f'{prefix}{len(placeholder_to_kaomoji)}__'
        placeholder_to_kaomoji[placeholder] = match.group(0)
        return placeholder

    # A single substitution pass replaces each match at its own position,
    # instead of searching the (already modified) text for the matched string.
    protected = _KAOMOJI_PATTERN.sub(_stash, sentence)
    return protected, placeholder_to_kaomoji


def recover_kaomoji(sentences: List[str], placeholder_to_kaomoji: Dict[str, str]) -> List[str]:
    """Restore the kaomoji that ``protect_kaomoji`` replaced with placeholders.

    Args:
        sentences: Sentences that may contain placeholders.
        placeholder_to_kaomoji: Mapping from placeholder to original kaomoji.

    Returns:
        A new list with every known placeholder replaced by its kaomoji.
        The input list is not modified.
    """
    if not placeholder_to_kaomoji:
        # Nothing was protected; still return a fresh list like the normal path.
        return list(sentences)

    recovered_sentences = []
    for sentence in sentences:
        for placeholder, kaomoji in placeholder_to_kaomoji.items():
            sentence = sentence.replace(placeholder, kaomoji)
        recovered_sentences.append(sentence)
    return recovered_sentences