拆分0.1.3

1
This commit is contained in:
SengokuCola
2025-02-26 22:18:44 +08:00
parent 94436c148b
commit b88f200f74
6 changed files with 512 additions and 617 deletions

View File

@@ -1,4 +1,5 @@
import time
import random
from typing import List
from .message import Message
import requests
@@ -113,3 +114,180 @@ def get_recent_group_messages(db, group_id: int, limit: int = 12) -> list:
# 按时间正序排列
message_objects.reverse()
return message_objects
def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences, keeping text inside book-title marks intact.

    The text is cut right after every delimiter character (full-width and
    ASCII sentence punctuation, commas, the ellipsis character and newlines)
    unless that delimiter sits between ``《`` and ``》``. A comma that ends a
    fragment is dropped; every other delimiter stays attached to its fragment.

    Args:
        text: The text to split.

    Returns:
        List[str]: The non-empty fragments, each stripped of surrounding
        whitespace, in their original order.
    """
    # Both full-width and ASCII forms are listed; a newline also ends a fragment.
    delimiters = frozenset(['。', '!', ',', ',', '?', '…', '!', '?', '\n'])
    # Only these two comma forms are removed from the end of a fragment.
    remove_chars = (',', ',')

    sentences: List[str] = []
    buffer: List[str] = []
    in_book_title = False  # True while between '《' and '》'

    def flush() -> None:
        """Finish the buffered fragment and store it if anything is left."""
        fragment = "".join(buffer)
        buffer.clear()
        if fragment.endswith(remove_chars):
            fragment = fragment[:-1]
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment)

    for char in text:
        buffer.append(char)
        if char == '《':
            in_book_title = True
        elif char == '》':
            in_book_title = False
        # Delimiters inside a book title must not break the title apart.
        if char in delimiters and not in_book_title:
            flush()

    # Whatever follows the last delimiter (or an unclosed title) is a fragment too.
    flush()
    return sentences
# Mapping of common typos: add_typos looks a character up as a key here and
# picks one character of the value string at random as its replacement.
# NOTE(review): in this view every key, and several values, is an empty string.
# That looks like characters lost during extraction — confirm against the real
# file. As written, the duplicate '' keys collapse into a single entry (the
# last one), and an empty value string leaves no candidate to choose from.
TYPO_DICT = {
'': '地得',
'': '咯啦勒',
'': '嘛麻',
'': '八把罢',
'': '',
'': '再在',
'': '',
'': '',
'': '沃窝喔',
'': '泥尼拟',
'': '它她塔祂',
'': '',
'': '阿哇',
'': '呐捏',
'': '豆读毒',
'': '',
'': '回汇',
'': '趣取曲',
'': '作坐',
'': '相像',
'': '说税睡',
'': '砍堪刊',
'': '来莱赖',
'': '号毫豪',
'': '给既继',
'': '锅果裹',
'': '',
'': '位未',
'': '甚深伸',
'': '末麽嘛',
'': '话花划',
'': '织直值',
'': '',
'': '听停挺',
'': '见件建',
'': '觉脚搅',
'': '得德锝',
'': '着找招',
'': '向象想',
'': '等灯登',
'': '谢写卸',
'': '对队',
'': '里理鲤',
'': '啦拉喇',
'': '吃持迟',
'': '哦喔噢',
'': '呀压',
'': '',
'': '太抬台',
'': '',
'': '',
'': '以已',
'': '因应',
'': '啥沙傻',
'': '行型形',
'': '哈蛤铪',
'': '嘿黑嗨',
'': '嗯恩摁',
'': '哎爱埃',
'': '呜屋污',
'': '喂位未',
'': '嘛麻马',
'': '嗨害亥',
'': '哇娃蛙',
'': '咦意易',
'': '嘻西希'
}
def random_remove_punctuation(text: str) -> str:
    """Randomly drop or soften punctuation to imitate casual typing.

    A full-width period that is the very last character is removed about 60%
    of the time. Each full-width comma is removed about 5% of the time and
    replaced by a single space about 20% of the time. Every other character
    is copied through unchanged.

    Args:
        text: The text to process.

    Returns:
        str: The text with some punctuation removed or replaced.
    """
    pieces = []
    last_index = len(text) - 1
    for i, char in enumerate(text):
        if char == '。' and i == last_index:  # trailing period only
            # random() > 0.4 holds about 60% of the time.
            # NOTE(review): an 80% rate may have been intended — confirm.
            if random.random() > 0.4:
                continue
        elif char == ',':
            rand = random.random()
            if rand < 0.05:  # 5%: drop the comma
                continue
            # The thresholds must differ, otherwise this branch is unreachable.
            if rand < 0.25:  # next 20%: turn the comma into a space
                pieces.append(' ')
                continue
        pieces.append(char)
    return ''.join(pieces)
def add_typos(text: str) -> str:
    """Randomly replace characters of text with typos taken from TYPO_DICT.

    Each character that has candidates in TYPO_DICT is replaced, with a 2%
    probability, by one candidate chosen at random. Characters without an
    entry, or whose entry lists no candidates, are always kept.

    Args:
        text: The text to process.

    Returns:
        str: The text with typos inserted.
    """
    typo_rate = 0.02  # probability of replacing an eligible character
    pieces = []
    for char in text:
        candidates = TYPO_DICT.get(char)
        # random.choice raises IndexError on an empty sequence, so an entry
        # with no candidates is treated as "no typo available".
        if candidates and random.random() < typo_rate:
            pieces.append(random.choice(candidates))
        else:
            pieces.append(char)
    return "".join(pieces)
def process_text_with_typos(text: str) -> str:
    """Apply typo insertion and punctuation softening to text, most of the time.

    Args:
        text: The text to process.

    Returns:
        str: In roughly 90% of calls, the text after add_typos followed by
        random_remove_punctuation; otherwise the text unchanged.
    """
    # Leave about one message in ten untouched.
    if random.random() >= 0.9:
        return text
    with_typos = add_typos(text)
    return random_remove_punctuation(with_typos)