Merge branch 'debug' of https://github.com/SengokuCola/MaiMBot into refactor/unified_request
# Conflicts:
#	src/plugins/chat/utils.py
@@ -89,7 +89,8 @@
 - 改进表情包发送逻辑
 - 自动生成的回复逻辑,例如自生成的回复方向,回复风格
 - 采用截断生成加快麦麦的反应速度
-- 改进发送消息的触发:
+- 改进发送消息的触发
+-
 
 ## 📌 注意事项
 纯编程外行,面向cursor编程,很多代码史一样多多包涵
@@ -61,6 +61,7 @@ ban_user_id = [] #禁止回复消息的QQ号
 
 [model.llm_reasoning] #R1
 name = "Pro/deepseek-ai/DeepSeek-R1"
+# name = "Qwen/QwQ-32B"
 base_url = "SILICONFLOW_BASE_URL"
 key = "SILICONFLOW_KEY"
 
@@ -30,16 +30,6 @@ class EmojiManager:
     _instance = None
     EMOJI_DIR = "data/emoji"  # 表情包存储目录
 
-    EMOTION_KEYWORDS = {
-        'happy': ['开心', '快乐', '高兴', '欢喜', '笑', '喜悦', '兴奋', '愉快', '乐', '好'],
-        'angry': ['生气', '愤怒', '恼火', '不爽', '火大', '怒', '气愤', '恼怒', '发火', '不满'],
-        'sad': ['伤心', '难过', '悲伤', '痛苦', '哭', '忧伤', '悲痛', '哀伤', '委屈', '失落'],
-        'surprised': ['惊讶', '震惊', '吃惊', '意外', '惊', '诧异', '惊奇', '惊喜', '不敢相信', '目瞪口呆'],
-        'disgusted': ['恶心', '讨厌', '厌恶', '反感', '嫌弃', '恶', '嫌恶', '憎恶', '不喜欢', '烦'],
-        'fearful': ['害怕', '恐惧', '惊恐', '担心', '怕', '惊吓', '惊慌', '畏惧', '胆怯', '惧'],
-        'neutral': ['普通', '一般', '还行', '正常', '平静', '平淡', '一般般', '凑合', '还好', '就这样']
-    }
-
     def __new__(cls):
         if cls._instance is None:
             cls._instance = super().__new__(cls)
@@ -84,7 +84,8 @@ class PromptBuilder:
         relevant_memories = await hippocampus.get_relevant_memories(
             text=message_txt,
             max_topics=5,
-            similarity_threshold=0.4
+            similarity_threshold=0.4,
+            max_memory_num=5
         )
 
         if relevant_memories:
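A hedged sketch of what the caller gets back from the widened call above (the entry keys and the sampling behaviour follow the hippocampus hunks further below; any dict layout beyond 'content' and 'similarity' is an assumption):

    memories = await hippocampus.get_relevant_memories(
        text=message_txt, max_topics=5,
        similarity_threshold=0.4, max_memory_num=5)
    for m in memories:  # sorted by 'similarity', at most max_memory_num entries
        print(m['similarity'], m['content'])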
@@ -13,6 +13,7 @@ from nonebot import get_driver
 from ..models.utils_model import LLM_request
 import aiohttp
 import jieba
+from ..utils.typo_generator import ChineseTypoGenerator
 
 driver = get_driver()
 config = driver.config
@@ -296,76 +297,6 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
     return sentences_done
 
 
-# 常见的错别字映射
-TYPO_DICT = {
-    '的': '地得',
-    '了': '咯啦勒',
-    '吗': '嘛麻',
-    '吧': '八把罢',
-    '是': '事',
-    '在': '再在',
-    '和': '合',
-    '有': '又',
-    '我': '沃窝喔',
-    '你': '泥尼拟',
-    '他': '它她塔祂',
-    '们': '门',
-    '啊': '阿哇',
-    '呢': '呐捏',
-    '都': '豆读毒',
-    '很': '狠',
-    '会': '回汇',
-    '去': '趣取曲',
-    '做': '作坐',
-    '想': '相像',
-    '说': '说税睡',
-    '看': '砍堪刊',
-    '来': '来莱赖',
-    '好': '号毫豪',
-    '给': '给既继',
-    '过': '锅果裹',
-    '能': '嫩',
-    '为': '位未',
-    '什': '甚深伸',
-    '么': '末麽嘛',
-    '话': '话花划',
-    '知': '织直值',
-    '道': '到',
-    '听': '听停挺',
-    '见': '见件建',
-    '觉': '觉脚搅',
-    '得': '得德锝',
-    '着': '着找招',
-    '像': '向象想',
-    '等': '等灯登',
-    '谢': '谢写卸',
-    '对': '对队',
-    '里': '里理鲤',
-    '啦': '啦拉喇',
-    '吃': '吃持迟',
-    '哦': '哦喔噢',
-    '呀': '呀压',
-    '要': '药',
-    '太': '太抬台',
-    '快': '块',
-    '点': '店',
-    '以': '以已',
-    '因': '因应',
-    '啥': '啥沙傻',
-    '行': '行型形',
-    '哈': '哈蛤铪',
-    '嘿': '嘿黑嗨',
-    '嗯': '嗯恩摁',
-    '哎': '哎爱埃',
-    '呜': '呜屋污',
-    '喂': '喂位未',
-    '嘛': '嘛麻马',
-    '嗨': '嗨害亥',
-    '哇': '哇娃蛙',
-    '咦': '咦意易',
-    '嘻': '嘻西希'
-}
-
 
 def random_remove_punctuation(text: str) -> str:
     """随机处理标点符号,模拟人类打字习惯
@@ -394,18 +325,6 @@ def random_remove_punctuation(text: str) -> str:
     return result
 
 
-def add_typos(text: str) -> str:
-    TYPO_RATE = 0.02  # 控制错别字出现的概率(2%)
-    result = ""
-    for char in text:
-        if char in TYPO_DICT and random.random() < TYPO_RATE:
-            # 从可能的错别字中随机选择一个
-            typos = TYPO_DICT[char]
-            result += random.choice(typos)
-        else:
-            result += char
-    return result
-
 
 def process_llm_response(text: str) -> List[str]:
     # processed_response = process_text_with_typos(content)
@@ -413,7 +332,14 @@ def process_llm_response(text: str) -> List[str]:
         print(f"回复过长 ({len(text)} 字符),返回默认回复")
         return ['懒得说']
     # 处理长消息
-    sentences = split_into_sentences_w_remove_punctuation(add_typos(text))
+    typo_generator = ChineseTypoGenerator(
+        error_rate=0.03,
+        min_freq=7,
+        tone_error_rate=0.2,
+        word_replace_rate=0.02
+    )
+    typoed_text = typo_generator.create_typo_sentence(text)[0]
+    sentences = split_into_sentences_w_remove_punctuation(typoed_text)
    # 检查分割后的消息数量是否过多(超过3条)
     if len(sentences) > 4:
         print(f"分割后消息数量过多 ({len(sentences)} 条),返回默认回复")
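A note on the [0] indexing in the hunk above: as the new typo_generator.py listing further below shows, create_typo_sentence returns a (typo_sentence, typo_info) tuple, so [0] keeps only the rewritten text. A minimal sketch, assuming the module is importable as in the utils.py import hunk (the sample sentence is illustrative):

    typo_generator = ChineseTypoGenerator(error_rate=0.03, min_freq=7,
                                          tone_error_rate=0.2, word_replace_rate=0.02)
    typoed_text, typo_info = typo_generator.create_typo_sentence("今天天气真好")
    # typoed_text is the sentence with injected typos; typo_info lists each substitution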
@@ -181,13 +181,19 @@ class Hippocampus:
         topic_num = self.calculate_topic_num(input_text, compress_rate)
         topics_response = await self.llm_model_get_topic.generate_response(self.find_topic_llm(input_text, topic_num))
         # 修改话题处理逻辑
-        print(f"话题: {topics_response[0]}")
-        topics = [topic.strip() for topic in topics_response[0].replace(",", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
-        print(f"话题: {topics}")
+        # 定义需要过滤的关键词
+        filter_keywords = ['表情包', '图片', '回复', '聊天记录']
 
-        # 创建所有话题的请求任务
+        # 过滤topics
+        topics = [topic.strip() for topic in topics_response[0].replace(",", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
+        filtered_topics = [topic for topic in topics if not any(keyword in topic for keyword in filter_keywords)]
+
+        # print(f"原始话题: {topics}")
+        print(f"过滤后话题: {filtered_topics}")
+
+        # 使用过滤后的话题继续处理
         tasks = []
-        for topic in topics:
+        for topic in filtered_topics:
             topic_what_prompt = self.topic_what(input_text, topic)
             # 创建异步任务
             task = self.llm_model_summary.generate_response_async(topic_what_prompt)
@@ -501,9 +507,9 @@ class Hippocampus:
             list: 识别出的主题列表
         """
         topics_response = await self.llm_model_get_topic.generate_response(self.find_topic_llm(text, 5))
-        print(f"话题: {topics_response[0]}")
+        # print(f"话题: {topics_response[0]}")
         topics = [topic.strip() for topic in topics_response[0].replace(",", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
-        print(f"话题: {topics}")
+        # print(f"话题: {topics}")
 
         return topics
 
@@ -579,7 +585,7 @@ class Hippocampus:
         print(f"\033[1;32m[记忆激活]\033[0m 识别出的主题: {identified_topics}")
 
         if not identified_topics:
-            print(f"\033[1;32m[记忆激活]\033[0m 未识别出主题,返回0")
+            # print(f"\033[1;32m[记忆激活]\033[0m 未识别出主题,返回0")
             return 0
 
         # 查找相似主题
@@ -644,7 +650,7 @@ class Hippocampus:
 
         return int(activation)
 
-    async def get_relevant_memories(self, text: str, max_topics: int = 5, similarity_threshold: float = 0.4) -> list:
+    async def get_relevant_memories(self, text: str, max_topics: int = 5, similarity_threshold: float = 0.4, max_memory_num: int = 5) -> list:
         """根据输入文本获取相关的记忆内容"""
         # 识别主题
         identified_topics = await self._identify_topics(text)
@@ -665,6 +671,9 @@ class Hippocampus:
             # 获取该主题的记忆内容
             first_layer, _ = self.memory_graph.get_related_item(topic, depth=1)
             if first_layer:
+                # 如果记忆条数超过限制,随机选择指定数量的记忆
+                if len(first_layer) > max_memory_num/2:
+                    first_layer = random.sample(first_layer, max_memory_num)
                 # 为每条记忆添加来源主题和相似度信息
                 for memory in first_layer:
                     relevant_memories.append({
@@ -673,9 +682,13 @@ class Hippocampus:
                         'content': memory
                     })
 
+        # 如果记忆数量超过5个,随机选择5个
         # 按相似度排序
         relevant_memories.sort(key=lambda x: x['similarity'], reverse=True)
+
+        if len(relevant_memories) > max_memory_num:
+            relevant_memories = random.sample(relevant_memories, max_memory_num)
 
         return relevant_memories
 
 
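One behavioural detail of the sampling added in the two hunks above: random.sample(population, k) raises ValueError when k exceeds the population size, so with the default max_memory_num=5 the first_layer branch only succeeds when at least five memories are present; the len(first_layer) > max_memory_num/2 guard alone does not guarantee that. A minimal illustration (values are made up):

    import random
    first_layer = ['m1', 'm2', 'm3']
    max_memory_num = 5
    if len(first_layer) > max_memory_num / 2:  # 3 > 2.5, so the branch is taken
        first_layer = random.sample(first_layer, max_memory_num)  # ValueError: sample larger than population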
@@ -234,16 +234,22 @@ class Hippocampus:
     async def memory_compress(self, input_text, compress_rate=0.1):
         print(input_text)
 
-        #获取topics
         topic_num = self.calculate_topic_num(input_text, compress_rate)
-        topics_response = await self.llm_model_get_topic.generate_response_async(self.find_topic_llm(input_text, topic_num))
+        topics_response = self.llm_model_get_topic.generate_response(self.find_topic_llm(input_text, topic_num))
         # 修改话题处理逻辑
+        # 定义需要过滤的关键词
+        filter_keywords = ['表情包', '图片', '回复', '聊天记录']
+
+        # 过滤topics
         topics = [topic.strip() for topic in topics_response[0].replace(",", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
-        print(f"话题: {topics}")
+        filtered_topics = [topic for topic in topics if not any(keyword in topic for keyword in filter_keywords)]
+
+        # print(f"原始话题: {topics}")
+        print(f"过滤后话题: {filtered_topics}")
 
         # 创建所有话题的请求任务
         tasks = []
-        for topic in topics:
+        for topic in filtered_topics:
             topic_what_prompt = self.topic_what(input_text, topic)
             # 创建异步任务
             task = self.llm_model_small.generate_response_async(topic_what_prompt)
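To make the filtering step above concrete, a small example of what the two comprehensions do (the topic strings here are made up for illustration):

    filter_keywords = ['表情包', '图片', '回复', '聊天记录']
    topics = ['天气', '表情包使用', '考试', '群聊天记录']
    filtered_topics = [t for t in topics if not any(k in t for k in filter_keywords)]
    # filtered_topics == ['天气', '考试']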
@@ -652,6 +658,21 @@ def visualize_graph_lite(memory_graph: Memory_graph, color_by_memory: bool = Fal
     # 创建一个新图用于可视化
     H = G.copy()
 
+    # 过滤掉内容数量小于2的节点
+    nodes_to_remove = []
+    for node in H.nodes():
+        memory_items = H.nodes[node].get('memory_items', [])
+        memory_count = len(memory_items) if isinstance(memory_items, list) else (1 if memory_items else 0)
+        if memory_count < 2:
+            nodes_to_remove.append(node)
+
+    H.remove_nodes_from(nodes_to_remove)
+
+    # 如果没有符合条件的节点,直接返回
+    if len(H.nodes()) == 0:
+        print("没有找到内容数量大于等于2的节点")
+        return
+
     # 计算节点大小和颜色
     node_colors = []
     node_sizes = []
@@ -704,7 +725,7 @@ def visualize_graph_lite(memory_graph: Memory_graph, color_by_memory: bool = Fal
             edge_color='gray',
             width=1.5)  # 统一的边宽度
 
-    title = '记忆图谱可视化 - 节点大小表示记忆数量\n节点颜色:蓝(弱连接)到红(强连接)渐变,边的透明度表示连接强度\n连接强度越大的节点距离越近'
+    title = '记忆图谱可视化(仅显示内容≥2的节点)\n节点大小表示记忆数量\n节点颜色:蓝(弱连接)到红(强连接)渐变,边的透明度表示连接强度\n连接强度越大的节点距离越近'
     plt.title(title, fontsize=16, fontfamily='SimHei')
     plt.show()
 
src/plugins/utils/typo_generator.py (new file, 437 lines)
@@ -0,0 +1,437 @@
"""
错别字生成器 - 基于拼音和字频的中文错别字生成工具
"""

from pypinyin import pinyin, Style
from collections import defaultdict
import json
import os
import jieba
from pathlib import Path
import random
import math
import time


class ChineseTypoGenerator:
    def __init__(self,
                 error_rate=0.3,
                 min_freq=5,
                 tone_error_rate=0.2,
                 word_replace_rate=0.3,
                 max_freq_diff=200):
        """
        初始化错别字生成器

        参数:
            error_rate: 单字替换概率
            min_freq: 最小字频阈值
            tone_error_rate: 声调错误概率
            word_replace_rate: 整词替换概率
            max_freq_diff: 最大允许的频率差异
        """
        self.error_rate = error_rate
        self.min_freq = min_freq
        self.tone_error_rate = tone_error_rate
        self.word_replace_rate = word_replace_rate
        self.max_freq_diff = max_freq_diff

        # 加载数据
        print("正在加载汉字数据库,请稍候...")
        self.pinyin_dict = self._create_pinyin_dict()
        self.char_frequency = self._load_or_create_char_frequency()

    def _load_or_create_char_frequency(self):
        """
        加载或创建汉字频率字典
        """
        cache_file = Path("char_frequency.json")

        # 如果缓存文件存在,直接加载
        if cache_file.exists():
            with open(cache_file, 'r', encoding='utf-8') as f:
                return json.load(f)

        # 使用内置的词频文件
        char_freq = defaultdict(int)
        dict_path = os.path.join(os.path.dirname(jieba.__file__), 'dict.txt')

        # 读取jieba的词典文件
        with open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                word, freq = line.strip().split()[:2]
                # 对词中的每个字进行频率累加
                for char in word:
                    if self._is_chinese_char(char):
                        char_freq[char] += int(freq)

        # 归一化频率值
        max_freq = max(char_freq.values())
        normalized_freq = {char: freq/max_freq * 1000 for char, freq in char_freq.items()}

        # 保存到缓存文件
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(normalized_freq, f, ensure_ascii=False, indent=2)

        return normalized_freq

    def _create_pinyin_dict(self):
        """
        创建拼音到汉字的映射字典
        """
        # 常用汉字范围
        chars = [chr(i) for i in range(0x4e00, 0x9fff)]
        pinyin_dict = defaultdict(list)

        # 为每个汉字建立拼音映射
        for char in chars:
            try:
                py = pinyin(char, style=Style.TONE3)[0][0]
                pinyin_dict[py].append(char)
            except Exception:
                continue

        return pinyin_dict

    def _is_chinese_char(self, char):
        """
        判断是否为汉字
        """
        try:
            return '\u4e00' <= char <= '\u9fff'
        except:
            return False

    def _get_pinyin(self, sentence):
        """
        将中文句子拆分成单个汉字并获取其拼音
        """
        # 将句子拆分成单个字符
        characters = list(sentence)

        # 获取每个字符的拼音
        result = []
        for char in characters:
            # 跳过空格和非汉字字符
            if char.isspace() or not self._is_chinese_char(char):
                continue
            # 获取拼音(数字声调)
            py = pinyin(char, style=Style.TONE3)[0][0]
            result.append((char, py))

        return result

    def _get_similar_tone_pinyin(self, py):
        """
        获取相似声调的拼音
        """
        # 检查拼音是否为空或无效
        if not py or len(py) < 1:
            return py

        # 如果最后一个字符不是数字,说明可能是轻声或其他特殊情况
        if not py[-1].isdigit():
            # 为非数字结尾的拼音添加数字声调1
            return py + '1'

        base = py[:-1]  # 去掉声调
        tone = int(py[-1])  # 获取声调

        # 处理轻声(通常用5表示)或无效声调
        if tone not in [1, 2, 3, 4]:
            return base + str(random.choice([1, 2, 3, 4]))

        # 正常处理声调
        possible_tones = [1, 2, 3, 4]
        possible_tones.remove(tone)  # 移除原声调
        new_tone = random.choice(possible_tones)  # 随机选择一个新声调
        return base + str(new_tone)

    def _calculate_replacement_probability(self, orig_freq, target_freq):
        """
        根据频率差计算替换概率
        """
        if target_freq > orig_freq:
            return 1.0  # 如果替换字频率更高,保持原有概率

        freq_diff = orig_freq - target_freq
        if freq_diff > self.max_freq_diff:
            return 0.0  # 频率差太大,不替换

        # 使用指数衰减函数计算概率
        # 频率差为0时概率为1,频率差为max_freq_diff时概率接近0
        return math.exp(-3 * freq_diff / self.max_freq_diff)

    def _get_similar_frequency_chars(self, char, py, num_candidates=5):
        """
        获取与给定字频率相近的同音字,可能包含声调错误
        """
        homophones = []

        # 有一定概率使用错误声调
        if random.random() < self.tone_error_rate:
            wrong_tone_py = self._get_similar_tone_pinyin(py)
            homophones.extend(self.pinyin_dict[wrong_tone_py])

        # 添加正确声调的同音字
        homophones.extend(self.pinyin_dict[py])

        if not homophones:
            return None

        # 获取原字的频率
        orig_freq = self.char_frequency.get(char, 0)

        # 计算所有同音字与原字的频率差,并过滤掉低频字
        freq_diff = [(h, self.char_frequency.get(h, 0))
                     for h in homophones
                     if h != char and self.char_frequency.get(h, 0) >= self.min_freq]

        if not freq_diff:
            return None

        # 计算每个候选字的替换概率
        candidates_with_prob = []
        for h, freq in freq_diff:
            prob = self._calculate_replacement_probability(orig_freq, freq)
            if prob > 0:  # 只保留有效概率的候选字
                candidates_with_prob.append((h, prob))

        if not candidates_with_prob:
            return None

        # 根据概率排序
        candidates_with_prob.sort(key=lambda x: x[1], reverse=True)

        # 返回概率最高的几个字
        return [char for char, _ in candidates_with_prob[:num_candidates]]

    def _get_word_pinyin(self, word):
        """
        获取词语的拼音列表
        """
        return [py[0] for py in pinyin(word, style=Style.TONE3)]

    def _segment_sentence(self, sentence):
        """
        使用jieba分词,返回词语列表
        """
        return list(jieba.cut(sentence))

    def _get_word_homophones(self, word):
        """
        获取整个词的同音词,只返回高频的有意义词语
        """
        if len(word) == 1:
            return []

        # 获取词的拼音
        word_pinyin = self._get_word_pinyin(word)

        # 遍历所有可能的同音字组合
        candidates = []
        for py in word_pinyin:
            chars = self.pinyin_dict.get(py, [])
            if not chars:
                return []
            candidates.append(chars)

        # 生成所有可能的组合
        import itertools
        all_combinations = itertools.product(*candidates)

        # 获取jieba词典和词频信息
        dict_path = os.path.join(os.path.dirname(jieba.__file__), 'dict.txt')
        valid_words = {}  # 改用字典存储词语及其频率
        with open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    word_text = parts[0]
                    word_freq = float(parts[1])  # 获取词频
                    valid_words[word_text] = word_freq

        # 获取原词的词频作为参考
        original_word_freq = valid_words.get(word, 0)
        min_word_freq = original_word_freq * 0.1  # 设置最小词频为原词频的10%

        # 过滤和计算频率
        homophones = []
        for combo in all_combinations:
            new_word = ''.join(combo)
            if new_word != word and new_word in valid_words:
                new_word_freq = valid_words[new_word]
                # 只保留词频达到阈值的词
                if new_word_freq >= min_word_freq:
                    # 计算词的平均字频(考虑字频和词频)
                    char_avg_freq = sum(self.char_frequency.get(c, 0) for c in new_word) / len(new_word)
                    # 综合评分:结合词频和字频
                    combined_score = (new_word_freq * 0.7 + char_avg_freq * 0.3)
                    if combined_score >= self.min_freq:
                        homophones.append((new_word, combined_score))

        # 按综合分数排序并限制返回数量
        sorted_homophones = sorted(homophones, key=lambda x: x[1], reverse=True)
        return [word for word, _ in sorted_homophones[:5]]  # 限制返回前5个结果

    def create_typo_sentence(self, sentence):
        """
        创建包含同音字错误的句子,支持词语级别和字级别的替换

        参数:
            sentence: 输入的中文句子

        返回:
            typo_sentence: 包含错别字的句子
            typo_info: 错别字信息列表
        """
        result = []
        typo_info = []

        # 分词
        words = self._segment_sentence(sentence)

        for word in words:
            # 如果是标点符号或空格,直接添加
            if all(not self._is_chinese_char(c) for c in word):
                result.append(word)
                continue

            # 获取词语的拼音
            word_pinyin = self._get_word_pinyin(word)

            # 尝试整词替换
            if len(word) > 1 and random.random() < self.word_replace_rate:
                word_homophones = self._get_word_homophones(word)
                if word_homophones:
                    typo_word = random.choice(word_homophones)
                    # 计算词的平均频率
                    orig_freq = sum(self.char_frequency.get(c, 0) for c in word) / len(word)
                    typo_freq = sum(self.char_frequency.get(c, 0) for c in typo_word) / len(typo_word)

                    # 添加到结果中
                    result.append(typo_word)
                    typo_info.append((word, typo_word,
                                      ' '.join(word_pinyin),
                                      ' '.join(self._get_word_pinyin(typo_word)),
                                      orig_freq, typo_freq))
                    continue

            # 如果不进行整词替换,则进行单字替换
            if len(word) == 1:
                char = word
                py = word_pinyin[0]
                if random.random() < self.error_rate:
                    similar_chars = self._get_similar_frequency_chars(char, py)
                    if similar_chars:
                        typo_char = random.choice(similar_chars)
                        typo_freq = self.char_frequency.get(typo_char, 0)
                        orig_freq = self.char_frequency.get(char, 0)
                        replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
                        if random.random() < replace_prob:
                            result.append(typo_char)
                            typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
                            typo_info.append((char, typo_char, py, typo_py, orig_freq, typo_freq))
                            continue
                result.append(char)
            else:
                # 处理多字词的单字替换
                word_result = []
                for i, (char, py) in enumerate(zip(word, word_pinyin)):
                    # 词中的字替换概率降低
                    word_error_rate = self.error_rate * (0.7 ** (len(word) - 1))

                    if random.random() < word_error_rate:
                        similar_chars = self._get_similar_frequency_chars(char, py)
                        if similar_chars:
                            typo_char = random.choice(similar_chars)
                            typo_freq = self.char_frequency.get(typo_char, 0)
                            orig_freq = self.char_frequency.get(char, 0)
                            replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
                            if random.random() < replace_prob:
                                word_result.append(typo_char)
                                typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
                                typo_info.append((char, typo_char, py, typo_py, orig_freq, typo_freq))
                                continue
                    word_result.append(char)
                result.append(''.join(word_result))

        return ''.join(result), typo_info

    def format_typo_info(self, typo_info):
        """
        格式化错别字信息

        参数:
            typo_info: 错别字信息列表

        返回:
            格式化后的错别字信息字符串
        """
        if not typo_info:
            return "未生成错别字"

        result = []
        for orig, typo, orig_py, typo_py, orig_freq, typo_freq in typo_info:
            # 判断是否为词语替换
            is_word = ' ' in orig_py
            if is_word:
                error_type = "整词替换"
            else:
                tone_error = orig_py[:-1] == typo_py[:-1] and orig_py[-1] != typo_py[-1]
                error_type = "声调错误" if tone_error else "同音字替换"

            result.append(f"原文:{orig}({orig_py}) [频率:{orig_freq:.2f}] -> "
                          f"替换:{typo}({typo_py}) [频率:{typo_freq:.2f}] [{error_type}]")

        return "\n".join(result)

    def set_params(self, **kwargs):
        """
        设置参数

        可设置参数:
            error_rate: 单字替换概率
            min_freq: 最小字频阈值
            tone_error_rate: 声调错误概率
            word_replace_rate: 整词替换概率
            max_freq_diff: 最大允许的频率差异
        """
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)
                print(f"参数 {key} 已设置为 {value}")
            else:
                print(f"警告: 参数 {key} 不存在")


def main():
    # 创建错别字生成器实例
    typo_generator = ChineseTypoGenerator(
        error_rate=0.03,
        min_freq=7,
        tone_error_rate=0.02,
        word_replace_rate=0.3
    )

    # 获取用户输入
    sentence = input("请输入中文句子:")

    # 创建包含错别字的句子
    start_time = time.time()
    typo_sentence, typo_info = typo_generator.create_typo_sentence(sentence)

    # 打印结果
    print("\n原句:", sentence)
    print("错字版:", typo_sentence)

    # 打印错别字信息
    if typo_info:
        print("\n错别字信息:")
        print(typo_generator.format_typo_info(typo_info))

    # 计算并打印总耗时
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\n总耗时:{total_time:.2f}秒")


if __name__ == "__main__":
    main()
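A quick worked example of the replacement probability used throughout the new module above: _calculate_replacement_probability returns exp(-3 * freq_diff / max_freq_diff) once the candidate character is rarer than the original, so with the default max_freq_diff=200 a frequency gap of 0 keeps the probability at 1.0, a gap of 100 gives exp(-1.5) ≈ 0.22, and a gap of 200 gives exp(-3) ≈ 0.05; anything beyond the cap is cut off to 0. A minimal sketch:

    import math
    max_freq_diff = 200
    for freq_diff in (0, 100, 200):
        print(freq_diff, round(math.exp(-3 * freq_diff / max_freq_diff), 3))
    # 0 1.0
    # 100 0.223
    # 200 0.05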
src/test/typo.py (325 lines)
@@ -1,73 +1,46 @@
 """
-错别字生成器 - 流程说明
-
-整体替换逻辑:
-1. 数据准备
-   - 加载字频词典:使用jieba词典计算汉字使用频率
-   - 创建拼音映射:建立拼音到汉字的映射关系
-   - 加载词频信息:从jieba词典获取词语使用频率
-
-2. 分词处理
-   - 使用jieba将输入句子分词
-   - 区分单字词和多字词
-   - 保留标点符号和空格
-
-3. 词语级别替换(针对多字词)
-   - 触发条件:词长>1 且 随机概率<0.3
-   - 替换流程:
-     a. 获取词语拼音
-     b. 生成所有可能的同音字组合
-     c. 过滤条件:
-        - 必须是jieba词典中的有效词
-        - 词频必须达到原词频的10%以上
-        - 综合评分(词频70%+字频30%)必须达到阈值
-     d. 按综合评分排序,选择最合适的替换词
-
-4. 字级别替换(针对单字词或未进行整词替换的多字词)
-   - 单字替换概率:0.3
-   - 多字词中的单字替换概率:0.3 * (0.7 ^ (词长-1))
-   - 替换流程:
-     a. 获取字的拼音
-     b. 声调错误处理(20%概率)
-     c. 获取同音字列表
-     d. 过滤条件:
-        - 字频必须达到最小阈值
-        - 频率差异不能过大(指数衰减计算)
-     e. 按频率排序选择替换字
-
-5. 频率控制机制
-   - 字频控制:使用归一化的字频(0-1000范围)
-   - 词频控制:使用jieba词典中的词频
-   - 频率差异计算:使用指数衰减函数
-   - 最小频率阈值:确保替换字/词不会太生僻
-
-6. 输出信息
-   - 原文和错字版本的对照
-   - 每个替换的详细信息(原字/词、替换后字/词、拼音、频率)
-   - 替换类型说明(整词替换/声调错误/同音字替换)
-   - 词语分析和完整拼音
-
-注意事项:
-1. 所有替换都必须使用有意义的词语
-2. 替换词的使用频率不能过低
-3. 多字词优先考虑整词替换
-4. 考虑声调变化的情况
-5. 保持标点符号和空格不变
+错别字生成器 - 基于拼音和字频的中文错别字生成工具
 """
 
 from pypinyin import pinyin, Style
 from collections import defaultdict
 import json
 import os
-import unicodedata
 import jieba
-import jieba.posseg as pseg
 from pathlib import Path
 import random
 import math
 import time
 
-def load_or_create_char_frequency():
+class ChineseTypoGenerator:
+    def __init__(self,
+                 error_rate=0.3,
+                 min_freq=5,
+                 tone_error_rate=0.2,
+                 word_replace_rate=0.3,
+                 max_freq_diff=200):
+        """
+        初始化错别字生成器
+
+        参数:
+            error_rate: 单字替换概率
+            min_freq: 最小字频阈值
+            tone_error_rate: 声调错误概率
+            word_replace_rate: 整词替换概率
+            max_freq_diff: 最大允许的频率差异
+        """
+        self.error_rate = error_rate
+        self.min_freq = min_freq
+        self.tone_error_rate = tone_error_rate
+        self.word_replace_rate = word_replace_rate
+        self.max_freq_diff = max_freq_diff
+
+        # 加载数据
+        print("正在加载汉字数据库,请稍候...")
+        self.pinyin_dict = self._create_pinyin_dict()
+        self.char_frequency = self._load_or_create_char_frequency()
+
+    def _load_or_create_char_frequency(self):
         """
         加载或创建汉字频率字典
         """
@@ -88,7 +61,7 @@ def load_or_create_char_frequency():
                 word, freq = line.strip().split()[:2]
                 # 对词中的每个字进行频率累加
                 for char in word:
-                    if is_chinese_char(char):
+                    if self._is_chinese_char(char):
                         char_freq[char] += int(freq)
 
         # 归一化频率值
@@ -101,8 +74,7 @@ def load_or_create_char_frequency():
 
         return normalized_freq
 
-# 创建拼音到汉字的映射字典
-def create_pinyin_dict():
+    def _create_pinyin_dict(self):
         """
         创建拼音到汉字的映射字典
         """
@@ -120,7 +92,7 @@ def create_pinyin_dict():
 
         return pinyin_dict
 
-def is_chinese_char(char):
+    def _is_chinese_char(self, char):
         """
         判断是否为汉字
         """
@@ -129,11 +101,9 @@ def is_chinese_char(char):
         except:
             return False
 
-def get_pinyin(sentence):
+    def _get_pinyin(self, sentence):
         """
         将中文句子拆分成单个汉字并获取其拼音
-    :param sentence: 输入的中文句子
-    :return: 每个汉字及其拼音的列表
         """
         # 将句子拆分成单个字符
         characters = list(sentence)
@@ -142,7 +112,7 @@ def get_pinyin(sentence):
         result = []
         for char in characters:
             # 跳过空格和非汉字字符
-            if char.isspace() or not is_chinese_char(char):
+            if char.isspace() or not self._is_chinese_char(char):
                 continue
             # 获取拼音(数字声调)
             py = pinyin(char, style=Style.TONE3)[0][0]
@@ -150,33 +120,9 @@ def get_pinyin(sentence):
 
         return result
 
-def get_homophone(char, py, pinyin_dict, char_frequency, min_freq=5):
-    """
-    获取同音字,按照使用频率排序
-    """
-    homophones = pinyin_dict[py]
-    # 移除原字并过滤低频字
-    if char in homophones:
-        homophones.remove(char)
-
-    # 过滤掉低频字
-    homophones = [h for h in homophones if char_frequency.get(h, 0) >= min_freq]
-
-    # 按照字频排序
-    sorted_homophones = sorted(homophones,
-                               key=lambda x: char_frequency.get(x, 0),
-                               reverse=True)
-
-    # 只返回前10个同音字,避免输出过多
-    return sorted_homophones[:10]
-
-def get_similar_tone_pinyin(py):
+    def _get_similar_tone_pinyin(self, py):
         """
         获取相似声调的拼音
-    例如:'ni3' 可能返回 'ni2' 或 'ni4'
-    处理特殊情况:
-    1. 轻声(如 'de5' 或 'le')
-    2. 非数字结尾的拼音
         """
         # 检查拼音是否为空或无效
         if not py or len(py) < 1:
@@ -200,50 +146,45 @@ def get_similar_tone_pinyin(py):
         new_tone = random.choice(possible_tones)  # 随机选择一个新声调
         return base + str(new_tone)
 
-def calculate_replacement_probability(orig_freq, target_freq, max_freq_diff=200):
+    def _calculate_replacement_probability(self, orig_freq, target_freq):
         """
         根据频率差计算替换概率
-    频率差越大,概率越低
-    :param orig_freq: 原字频率
-    :param target_freq: 目标字频率
-    :param max_freq_diff: 最大允许的频率差
-    :return: 0-1之间的概率值
         """
         if target_freq > orig_freq:
             return 1.0  # 如果替换字频率更高,保持原有概率
 
         freq_diff = orig_freq - target_freq
-        if freq_diff > max_freq_diff:
+        if freq_diff > self.max_freq_diff:
             return 0.0  # 频率差太大,不替换
 
         # 使用指数衰减函数计算概率
         # 频率差为0时概率为1,频率差为max_freq_diff时概率接近0
-        return math.exp(-3 * freq_diff / max_freq_diff)
+        return math.exp(-3 * freq_diff / self.max_freq_diff)
 
-def get_similar_frequency_chars(char, py, pinyin_dict, char_frequency, num_candidates=5, min_freq=5, tone_error_rate=0.2):
+    def _get_similar_frequency_chars(self, char, py, num_candidates=5):
         """
         获取与给定字频率相近的同音字,可能包含声调错误
         """
         homophones = []
 
-        # 有20%的概率使用错误声调
-        if random.random() < tone_error_rate:
-            wrong_tone_py = get_similar_tone_pinyin(py)
-            homophones.extend(pinyin_dict[wrong_tone_py])
+        # 有一定概率使用错误声调
+        if random.random() < self.tone_error_rate:
+            wrong_tone_py = self._get_similar_tone_pinyin(py)
+            homophones.extend(self.pinyin_dict[wrong_tone_py])
 
         # 添加正确声调的同音字
-        homophones.extend(pinyin_dict[py])
+        homophones.extend(self.pinyin_dict[py])
 
         if not homophones:
             return None
 
         # 获取原字的频率
-        orig_freq = char_frequency.get(char, 0)
+        orig_freq = self.char_frequency.get(char, 0)
 
         # 计算所有同音字与原字的频率差,并过滤掉低频字
-        freq_diff = [(h, char_frequency.get(h, 0))
+        freq_diff = [(h, self.char_frequency.get(h, 0))
                      for h in homophones
-                     if h != char and char_frequency.get(h, 0) >= min_freq]
+                     if h != char and self.char_frequency.get(h, 0) >= self.min_freq]
 
         if not freq_diff:
             return None
@@ -251,7 +192,7 @@ def get_similar_frequency_chars(char, py, pinyin_dict, char_frequency, num_candi
         # 计算每个候选字的替换概率
         candidates_with_prob = []
         for h, freq in freq_diff:
-            prob = calculate_replacement_probability(orig_freq, freq)
+            prob = self._calculate_replacement_probability(orig_freq, freq)
             if prob > 0:  # 只保留有效概率的候选字
                 candidates_with_prob.append((h, prob))
 
@@ -264,41 +205,32 @@ def get_similar_frequency_chars(char, py, pinyin_dict, char_frequency, num_candi
         # 返回概率最高的几个字
         return [char for char, _ in candidates_with_prob[:num_candidates]]
 
-def get_word_pinyin(word):
+    def _get_word_pinyin(self, word):
         """
         获取词语的拼音列表
         """
         return [py[0] for py in pinyin(word, style=Style.TONE3)]
 
-def segment_sentence(sentence):
+    def _segment_sentence(self, sentence):
         """
         使用jieba分词,返回词语列表
         """
         return list(jieba.cut(sentence))
 
-def get_word_homophones(word, pinyin_dict, char_frequency, min_freq=5):
+    def _get_word_homophones(self, word):
         """
         获取整个词的同音词,只返回高频的有意义词语
-    :param word: 输入词语
-    :param pinyin_dict: 拼音字典
-    :param char_frequency: 字频字典
-    :param min_freq: 最小频率阈值
-    :return: 同音词列表
         """
         if len(word) == 1:
             return []
 
         # 获取词的拼音
-    word_pinyin = get_word_pinyin(word)
-    word_pinyin_str = ''.join(word_pinyin)
-
-    # 创建词语频率字典
-    word_freq = defaultdict(float)
+        word_pinyin = self._get_word_pinyin(word)
 
         # 遍历所有可能的同音字组合
         candidates = []
         for py in word_pinyin:
-            chars = pinyin_dict.get(py, [])
+            chars = self.pinyin_dict.get(py, [])
             if not chars:
                 return []
             candidates.append(chars)
@@ -331,50 +263,56 @@ def get_word_homophones(word, pinyin_dict, char_frequency, min_freq=5):
                 # 只保留词频达到阈值的词
                 if new_word_freq >= min_word_freq:
                     # 计算词的平均字频(考虑字频和词频)
-                    char_avg_freq = sum(char_frequency.get(c, 0) for c in new_word) / len(new_word)
+                    char_avg_freq = sum(self.char_frequency.get(c, 0) for c in new_word) / len(new_word)
                     # 综合评分:结合词频和字频
                     combined_score = (new_word_freq * 0.7 + char_avg_freq * 0.3)
-                    if combined_score >= min_freq:
+                    if combined_score >= self.min_freq:
                         homophones.append((new_word, combined_score))
 
         # 按综合分数排序并限制返回数量
         sorted_homophones = sorted(homophones, key=lambda x: x[1], reverse=True)
         return [word for word, _ in sorted_homophones[:5]]  # 限制返回前5个结果
 
-def create_typo_sentence(sentence, pinyin_dict, char_frequency, error_rate=0.5, min_freq=5, tone_error_rate=0.2, word_replace_rate=0.3):
+    def create_typo_sentence(self, sentence):
         """
         创建包含同音字错误的句子,支持词语级别和字级别的替换
-    只使用高频的有意义词语进行替换
+
+        参数:
+            sentence: 输入的中文句子
+
+        返回:
+            typo_sentence: 包含错别字的句子
+            typo_info: 错别字信息列表
         """
         result = []
         typo_info = []
 
         # 分词
-        words = segment_sentence(sentence)
+        words = self._segment_sentence(sentence)
 
         for word in words:
             # 如果是标点符号或空格,直接添加
-            if all(not is_chinese_char(c) for c in word):
+            if all(not self._is_chinese_char(c) for c in word):
                 result.append(word)
                 continue
 
             # 获取词语的拼音
-            word_pinyin = get_word_pinyin(word)
+            word_pinyin = self._get_word_pinyin(word)
 
             # 尝试整词替换
-            if len(word) > 1 and random.random() < word_replace_rate:
-                word_homophones = get_word_homophones(word, pinyin_dict, char_frequency, min_freq)
+            if len(word) > 1 and random.random() < self.word_replace_rate:
+                word_homophones = self._get_word_homophones(word)
                 if word_homophones:
                     typo_word = random.choice(word_homophones)
                     # 计算词的平均频率
-                    orig_freq = sum(char_frequency.get(c, 0) for c in word) / len(word)
-                    typo_freq = sum(char_frequency.get(c, 0) for c in typo_word) / len(typo_word)
+                    orig_freq = sum(self.char_frequency.get(c, 0) for c in word) / len(word)
+                    typo_freq = sum(self.char_frequency.get(c, 0) for c in typo_word) / len(typo_word)
 
                     # 添加到结果中
                     result.append(typo_word)
                     typo_info.append((word, typo_word,
                                       ' '.join(word_pinyin),
-                                      ' '.join(get_word_pinyin(typo_word)),
+                                      ' '.join(self._get_word_pinyin(typo_word)),
                                       orig_freq, typo_freq))
                     continue
 
@@ -382,14 +320,13 @@ def create_typo_sentence(sentence, pinyin_dict, char_frequency, error_rate=0.5,
             if len(word) == 1:
                 char = word
                 py = word_pinyin[0]
-                if random.random() < error_rate:
-                    similar_chars = get_similar_frequency_chars(char, py, pinyin_dict, char_frequency,
-                                                                min_freq=min_freq, tone_error_rate=tone_error_rate)
+                if random.random() < self.error_rate:
+                    similar_chars = self._get_similar_frequency_chars(char, py)
                     if similar_chars:
                         typo_char = random.choice(similar_chars)
-                        typo_freq = char_frequency.get(typo_char, 0)
-                        orig_freq = char_frequency.get(char, 0)
-                        replace_prob = calculate_replacement_probability(orig_freq, typo_freq)
+                        typo_freq = self.char_frequency.get(typo_char, 0)
+                        orig_freq = self.char_frequency.get(char, 0)
+                        replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
                         if random.random() < replace_prob:
                             result.append(typo_char)
                             typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
@@ -401,16 +338,15 @@ def create_typo_sentence(sentence, pinyin_dict, char_frequency, error_rate=0.5,
                 word_result = []
                 for i, (char, py) in enumerate(zip(word, word_pinyin)):
                     # 词中的字替换概率降低
-                    word_error_rate = error_rate * (0.7 ** (len(word) - 1))
+                    word_error_rate = self.error_rate * (0.7 ** (len(word) - 1))
 
                     if random.random() < word_error_rate:
-                        similar_chars = get_similar_frequency_chars(char, py, pinyin_dict, char_frequency,
-                                                                    min_freq=min_freq, tone_error_rate=tone_error_rate)
+                        similar_chars = self._get_similar_frequency_chars(char, py)
                         if similar_chars:
                             typo_char = random.choice(similar_chars)
-                            typo_freq = char_frequency.get(typo_char, 0)
-                            orig_freq = char_frequency.get(char, 0)
-                            replace_prob = calculate_replacement_probability(orig_freq, typo_freq)
+                            typo_freq = self.char_frequency.get(typo_char, 0)
+                            orig_freq = self.char_frequency.get(char, 0)
+                            replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
                             if random.random() < replace_prob:
                                 word_result.append(typo_char)
                                 typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
@@ -421,35 +357,20 @@ def create_typo_sentence(sentence, pinyin_dict, char_frequency, error_rate=0.5,
 
         return ''.join(result), typo_info
 
-def format_frequency(freq):
+    def format_typo_info(self, typo_info):
         """
-    格式化频率显示
+        格式化错别字信息
+
+        参数:
+            typo_info: 错别字信息列表
+
+        返回:
+            格式化后的错别字信息字符串
         """
-    return f"{freq:.2f}"
+        if not typo_info:
+            return "未生成错别字"
 
-def main():
-    # 记录开始时间
-    start_time = time.time()
-
-    # 首先创建拼音字典和加载字频统计
-    print("正在加载汉字数据库,请稍候...")
-    pinyin_dict = create_pinyin_dict()
-    char_frequency = load_or_create_char_frequency()
-
-    # 获取用户输入
-    sentence = input("请输入中文句子:")
-
-    # 创建包含错别字的句子
-    typo_sentence, typo_info = create_typo_sentence(sentence, pinyin_dict, char_frequency,
-                                                    error_rate=0.3, min_freq=5,
-                                                    tone_error_rate=0.2, word_replace_rate=0.3)
-
-    # 打印结果
-    print("\n原句:", sentence)
-    print("错字版:", typo_sentence)
-
-    if typo_info:
-        print("\n错别字信息:")
+        result = []
         for orig, typo, orig_py, typo_py, orig_freq, typo_freq in typo_info:
             # 判断是否为词语替换
             is_word = ' ' in orig_py
@@ -459,25 +380,53 @@ def main():
                 tone_error = orig_py[:-1] == typo_py[:-1] and orig_py[-1] != typo_py[-1]
                 error_type = "声调错误" if tone_error else "同音字替换"
 
-            print(f"原文:{orig}({orig_py}) [频率:{format_frequency(orig_freq)}] -> "
-                  f"替换:{typo}({typo_py}) [频率:{format_frequency(typo_freq)}] [{error_type}]")
+            result.append(f"原文:{orig}({orig_py}) [频率:{orig_freq:.2f}] -> "
+                          f"替换:{typo}({typo_py}) [频率:{typo_freq:.2f}] [{error_type}]")
 
-    # 获取拼音结果
-    result = get_pinyin(sentence)
+        return "\n".join(result)
 
-    # 打印完整拼音
-    print("\n完整拼音:")
-    print(" ".join(py for _, py in result))
-
-    # 打印词语分析
-    print("\n词语分析:")
-    words = segment_sentence(sentence)
-    for word in words:
-        if any(is_chinese_char(c) for c in word):
-            word_pinyin = get_word_pinyin(word)
-            print(f"词语:{word}")
-            print(f"拼音:{' '.join(word_pinyin)}")
-            print("---")
+    def set_params(self, **kwargs):
+        """
+        设置参数
+
+        可设置参数:
+            error_rate: 单字替换概率
+            min_freq: 最小字频阈值
+            tone_error_rate: 声调错误概率
+            word_replace_rate: 整词替换概率
+            max_freq_diff: 最大允许的频率差异
+        """
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+                print(f"参数 {key} 已设置为 {value}")
+            else:
+                print(f"警告: 参数 {key} 不存在")
+
+def main():
+    # 创建错别字生成器实例
+    typo_generator = ChineseTypoGenerator(
+        error_rate=0.03,
+        min_freq=7,
+        tone_error_rate=0.02,
+        word_replace_rate=0.3
+    )
+
+    # 获取用户输入
+    sentence = input("请输入中文句子:")
+
+    # 创建包含错别字的句子
+    start_time = time.time()
+    typo_sentence, typo_info = typo_generator.create_typo_sentence(sentence)
+
+    # 打印结果
+    print("\n原句:", sentence)
+    print("错字版:", typo_sentence)
+
+    # 打印错别字信息
+    if typo_info:
+        print("\n错别字信息:")
+        print(typo_generator.format_typo_info(typo_info))
+
     # 计算并打印总耗时
     end_time = time.time()