""" 错别字生成器 - 基于拼音和字频的中文错别字生成工具 """ from pypinyin import pinyin, Style from collections import defaultdict import json import os import jieba from pathlib import Path import random import math import time class ChineseTypoGenerator: def __init__(self, error_rate=0.3, min_freq=5, tone_error_rate=0.2, word_replace_rate=0.3, max_freq_diff=200): """ 初始化错别字生成器 参数: error_rate: 单字替换概率 min_freq: 最小字频阈值 tone_error_rate: 声调错误概率 word_replace_rate: 整词替换概率 max_freq_diff: 最大允许的频率差异 """ self.error_rate = error_rate self.min_freq = min_freq self.tone_error_rate = tone_error_rate self.word_replace_rate = word_replace_rate self.max_freq_diff = max_freq_diff # 加载数据 print("正在加载汉字数据库,请稍候...") self.pinyin_dict = self._create_pinyin_dict() self.char_frequency = self._load_or_create_char_frequency() def _load_or_create_char_frequency(self): """ 加载或创建汉字频率字典 """ cache_file = Path("char_frequency.json") # 如果缓存文件存在,直接加载 if cache_file.exists(): with open(cache_file, 'r', encoding='utf-8') as f: return json.load(f) # 使用内置的词频文件 char_freq = defaultdict(int) dict_path = os.path.join(os.path.dirname(jieba.__file__), 'dict.txt') # 读取jieba的词典文件 with open(dict_path, 'r', encoding='utf-8') as f: for line in f: word, freq = line.strip().split()[:2] # 对词中的每个字进行频率累加 for char in word: if self._is_chinese_char(char): char_freq[char] += int(freq) # 归一化频率值 max_freq = max(char_freq.values()) normalized_freq = {char: freq/max_freq * 1000 for char, freq in char_freq.items()} # 保存到缓存文件 with open(cache_file, 'w', encoding='utf-8') as f: json.dump(normalized_freq, f, ensure_ascii=False, indent=2) return normalized_freq def _create_pinyin_dict(self): """ 创建拼音到汉字的映射字典 """ # 常用汉字范围 chars = [chr(i) for i in range(0x4e00, 0x9fff)] pinyin_dict = defaultdict(list) # 为每个汉字建立拼音映射 for char in chars: try: py = pinyin(char, style=Style.TONE3)[0][0] pinyin_dict[py].append(char) except Exception: continue return pinyin_dict def _is_chinese_char(self, char): """ 判断是否为汉字 """ try: return '\u4e00' <= char <= '\u9fff' except: return False def _get_pinyin(self, sentence): """ 将中文句子拆分成单个汉字并获取其拼音 """ # 将句子拆分成单个字符 characters = list(sentence) # 获取每个字符的拼音 result = [] for char in characters: # 跳过空格和非汉字字符 if char.isspace() or not self._is_chinese_char(char): continue # 获取拼音(数字声调) py = pinyin(char, style=Style.TONE3)[0][0] result.append((char, py)) return result def _get_similar_tone_pinyin(self, py): """ 获取相似声调的拼音 """ # 检查拼音是否为空或无效 if not py or len(py) < 1: return py # 如果最后一个字符不是数字,说明可能是轻声或其他特殊情况 if not py[-1].isdigit(): # 为非数字结尾的拼音添加数字声调1 return py + '1' base = py[:-1] # 去掉声调 tone = int(py[-1]) # 获取声调 # 处理轻声(通常用5表示)或无效声调 if tone not in [1, 2, 3, 4]: return base + str(random.choice([1, 2, 3, 4])) # 正常处理声调 possible_tones = [1, 2, 3, 4] possible_tones.remove(tone) # 移除原声调 new_tone = random.choice(possible_tones) # 随机选择一个新声调 return base + str(new_tone) def _calculate_replacement_probability(self, orig_freq, target_freq): """ 根据频率差计算替换概率 """ if target_freq > orig_freq: return 1.0 # 如果替换字频率更高,保持原有概率 freq_diff = orig_freq - target_freq if freq_diff > self.max_freq_diff: return 0.0 # 频率差太大,不替换 # 使用指数衰减函数计算概率 # 频率差为0时概率为1,频率差为max_freq_diff时概率接近0 return math.exp(-3 * freq_diff / self.max_freq_diff) def _get_similar_frequency_chars(self, char, py, num_candidates=5): """ 获取与给定字频率相近的同音字,可能包含声调错误 """ homophones = [] # 有一定概率使用错误声调 if random.random() < self.tone_error_rate: wrong_tone_py = self._get_similar_tone_pinyin(py) homophones.extend(self.pinyin_dict[wrong_tone_py]) # 添加正确声调的同音字 homophones.extend(self.pinyin_dict[py]) if not homophones: return None # 获取原字的频率 orig_freq = 
    def _get_similar_frequency_chars(self, char, py, num_candidates=5):
        """Return homophones of similar frequency, possibly with tone errors."""
        homophones = []

        # With some probability, use a wrong tone first.
        if random.random() < self.tone_error_rate:
            wrong_tone_py = self._get_similar_tone_pinyin(py)
            homophones.extend(self.pinyin_dict[wrong_tone_py])

        # Add homophones with the correct tone.
        homophones.extend(self.pinyin_dict[py])

        if not homophones:
            return None

        # Frequency of the original character.
        orig_freq = self.char_frequency.get(char, 0)

        # Collect homophone frequencies, dropping the original character and
        # any candidate below the minimum frequency threshold.
        freq_diff = [(h, self.char_frequency.get(h, 0)) for h in homophones
                     if h != char and self.char_frequency.get(h, 0) >= self.min_freq]

        if not freq_diff:
            return None

        # Compute a replacement probability for each candidate.
        candidates_with_prob = []
        for h, freq in freq_diff:
            prob = self._calculate_replacement_probability(orig_freq, freq)
            if prob > 0:  # Keep only candidates with a usable probability.
                candidates_with_prob.append((h, prob))

        if not candidates_with_prob:
            return None

        # Sort by probability and return the top candidates.
        candidates_with_prob.sort(key=lambda x: x[1], reverse=True)
        return [c for c, _ in candidates_with_prob[:num_candidates]]

    def _get_word_pinyin(self, word):
        """Return the list of pinyin syllables for a word."""
        return [py[0] for py in pinyin(word, style=Style.TONE3)]

    def _segment_sentence(self, sentence):
        """Segment a sentence into words with jieba."""
        return list(jieba.cut(sentence))

    def _get_word_homophones(self, word):
        """Return homophone words, keeping only high-frequency, meaningful ones."""
        if len(word) == 1:
            return []

        # Pinyin of the word.
        word_pinyin = self._get_word_pinyin(word)

        # Candidate characters for every syllable.
        candidates = []
        for py in word_pinyin:
            chars = self.pinyin_dict.get(py, [])
            if not chars:
                return []
            candidates.append(chars)

        # All possible character combinations.
        all_combinations = itertools.product(*candidates)

        # Load jieba's dictionary with word frequencies.
        dict_path = os.path.join(os.path.dirname(jieba.__file__), 'dict.txt')
        valid_words = {}  # word -> frequency
        with open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    word_text = parts[0]
                    word_freq = float(parts[1])
                    valid_words[word_text] = word_freq

        # Use the original word's frequency as a reference; candidates must
        # reach at least 10% of it.
        original_word_freq = valid_words.get(word, 0)
        min_word_freq = original_word_freq * 0.1

        # Filter the combinations and score them.
        homophones = []
        for combo in all_combinations:
            new_word = ''.join(combo)
            if new_word != word and new_word in valid_words:
                new_word_freq = valid_words[new_word]
                # Keep only words whose frequency reaches the threshold.
                if new_word_freq >= min_word_freq:
                    # Average character frequency of the candidate word.
                    char_avg_freq = sum(self.char_frequency.get(c, 0)
                                        for c in new_word) / len(new_word)
                    # Combined score: weighted word and character frequency.
                    combined_score = new_word_freq * 0.7 + char_avg_freq * 0.3
                    if combined_score >= self.min_freq:
                        homophones.append((new_word, combined_score))

        # Sort by combined score and return at most the top five words.
        sorted_homophones = sorted(homophones, key=lambda x: x[1], reverse=True)
        return [w for w, _ in sorted_homophones[:5]]
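    # Each record appended to typo_info below is a 6-tuple:
    # (original, replacement, original_pinyin, replacement_pinyin,
    #  original_frequency, replacement_frequency).
    # A hypothetical single-character record could look like
    # ('在', '再', 'zai4', 'zai4', 812.3, 95.6); the actual frequency values
    # depend on the jieba dictionary installed on the local machine.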
    def create_typo_sentence(self, sentence):
        """Create a sentence with homophone typos at word and character level.

        Args:
            sentence: the input Chinese sentence

        Returns:
            typo_sentence: the sentence with typos inserted
            typo_info: list of typo records
        """
        result = []
        typo_info = []

        # Segment the sentence into words.
        words = self._segment_sentence(sentence)

        for word in words:
            # Punctuation and whitespace are appended unchanged.
            if all(not self._is_chinese_char(c) for c in word):
                result.append(word)
                continue

            # Pinyin of the current word.
            word_pinyin = self._get_word_pinyin(word)

            # Try a whole-word replacement first.
            if len(word) > 1 and random.random() < self.word_replace_rate:
                word_homophones = self._get_word_homophones(word)
                if word_homophones:
                    typo_word = random.choice(word_homophones)
                    # Average character frequency of the original and typo word.
                    orig_freq = sum(self.char_frequency.get(c, 0) for c in word) / len(word)
                    typo_freq = sum(self.char_frequency.get(c, 0) for c in typo_word) / len(typo_word)
                    result.append(typo_word)
                    typo_info.append((word, typo_word,
                                      ' '.join(word_pinyin),
                                      ' '.join(self._get_word_pinyin(typo_word)),
                                      orig_freq, typo_freq))
                    continue

            # No whole-word replacement: fall back to single-character replacement.
            if len(word) == 1:
                char = word
                py = word_pinyin[0]
                if random.random() < self.error_rate:
                    similar_chars = self._get_similar_frequency_chars(char, py)
                    if similar_chars:
                        typo_char = random.choice(similar_chars)
                        typo_freq = self.char_frequency.get(typo_char, 0)
                        orig_freq = self.char_frequency.get(char, 0)
                        replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
                        if random.random() < replace_prob:
                            result.append(typo_char)
                            typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
                            typo_info.append((char, typo_char, py, typo_py,
                                              orig_freq, typo_freq))
                            continue
                result.append(char)
            else:
                # Single-character replacement inside a multi-character word.
                word_result = []
                for char, py in zip(word, word_pinyin):
                    # Characters inside longer words get a lower replacement rate.
                    word_error_rate = self.error_rate * (0.7 ** (len(word) - 1))
                    if random.random() < word_error_rate:
                        similar_chars = self._get_similar_frequency_chars(char, py)
                        if similar_chars:
                            typo_char = random.choice(similar_chars)
                            typo_freq = self.char_frequency.get(typo_char, 0)
                            orig_freq = self.char_frequency.get(char, 0)
                            replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
                            if random.random() < replace_prob:
                                word_result.append(typo_char)
                                typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
                                typo_info.append((char, typo_char, py, typo_py,
                                                  orig_freq, typo_freq))
                                continue
                    word_result.append(char)
                result.append(''.join(word_result))

        return ''.join(result), typo_info

    def format_typo_info(self, typo_info):
        """Format the typo records into a human-readable string.

        Args:
            typo_info: list of typo records

        Returns:
            A formatted string describing each typo.
        """
        if not typo_info:
            return "未生成错别字"

        result = []
        for orig, typo, orig_py, typo_py, orig_freq, typo_freq in typo_info:
            # A space in the pinyin means this was a whole-word replacement.
            is_word = ' ' in orig_py
            if is_word:
                error_type = "整词替换"
            else:
                tone_error = orig_py[:-1] == typo_py[:-1] and orig_py[-1] != typo_py[-1]
                error_type = "声调错误" if tone_error else "同音字替换"
            result.append(f"原文:{orig}({orig_py}) [频率:{orig_freq:.2f}] -> "
                          f"替换:{typo}({typo_py}) [频率:{typo_freq:.2f}] [{error_type}]")

        return "\n".join(result)

    def set_params(self, **kwargs):
        """Update generator parameters.

        Supported keys:
            error_rate: probability of replacing an individual character
            min_freq: minimum character-frequency threshold for candidates
            tone_error_rate: probability of introducing a tone error
            word_replace_rate: probability of replacing a whole word
            max_freq_diff: maximum allowed frequency gap between characters
        """
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)
                print(f"参数 {key} 已设置为 {value}")
            else:
                print(f"警告: 参数 {key} 不存在")


def main():
    # Create a typo generator instance.
    typo_generator = ChineseTypoGenerator(
        error_rate=0.03,
        min_freq=7,
        tone_error_rate=0.02,
        word_replace_rate=0.3
    )

    # Read the input sentence.
    sentence = input("请输入中文句子:")

    # Generate the sentence with typos.
    start_time = time.time()
    typo_sentence, typo_info = typo_generator.create_typo_sentence(sentence)

    # Print the result.
    print("\n原句:", sentence)
    print("错字版:", typo_sentence)

    # Print the typo details.
    if typo_info:
        print("\n错别字信息:")
        print(typo_generator.format_typo_info(typo_info))

    # Report the total elapsed time.
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\n总耗时:{total_time:.2f}秒")


if __name__ == "__main__":
    main()
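# Minimal programmatic usage sketch (illustrative only; the sample sentence and
# parameter values below are arbitrary and not part of the original script):
#
#     generator = ChineseTypoGenerator(error_rate=0.1, word_replace_rate=0.2)
#     generator.set_params(tone_error_rate=0.05)
#     typo_sentence, typo_info = generator.create_typo_sentence("今天天气真不错")
#     print(typo_sentence)
#     print(generator.format_typo_info(typo_info))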