Fixed memory flooding; added a new and improved typo generator; added memory filtering
SengokuCola
2025-03-07 00:09:36 +08:00
parent 21d1a69b6e
commit 8ef00ee571
8 changed files with 883 additions and 547 deletions

View File

@@ -29,16 +29,6 @@ config = driver.config
class EmojiManager:
_instance = None
EMOJI_DIR = "data/emoji" # 表情包存储目录
EMOTION_KEYWORDS = {
'happy': ['开心', '快乐', '高兴', '欢喜', '', '喜悦', '兴奋', '愉快', '', ''],
'angry': ['生气', '愤怒', '恼火', '不爽', '火大', '', '气愤', '恼怒', '发火', '不满'],
'sad': ['伤心', '难过', '悲伤', '痛苦', '', '忧伤', '悲痛', '哀伤', '委屈', '失落'],
'surprised': ['惊讶', '震惊', '吃惊', '意外', '', '诧异', '惊奇', '惊喜', '不敢相信', '目瞪口呆'],
'disgusted': ['恶心', '讨厌', '厌恶', '反感', '嫌弃', '', '嫌恶', '憎恶', '不喜欢', ''],
'fearful': ['害怕', '恐惧', '惊恐', '担心', '', '惊吓', '惊慌', '畏惧', '胆怯', ''],
'neutral': ['普通', '一般', '还行', '正常', '平静', '平淡', '一般般', '凑合', '还好', '就这样']
}
def __new__(cls):
if cls._instance is None:

View File

@@ -84,7 +84,8 @@ class PromptBuilder:
relevant_memories = await hippocampus.get_relevant_memories(
text=message_txt,
max_topics=5,
similarity_threshold=0.4
similarity_threshold=0.4,
max_memory_num=5
)
if relevant_memories:

View File

@@ -13,6 +13,7 @@ from nonebot import get_driver
from ..models.utils_model import LLM_request
import aiohttp
import jieba
from ..utils.typo_generator import ChineseTypoGenerator
driver = get_driver()
config = driver.config
@@ -285,75 +286,6 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
print(f"处理后的句子: {sentences_done}")
return sentences_done
# 常见的错别字映射
TYPO_DICT = {
'': '地得',
'': '咯啦勒',
'': '嘛麻',
'': '八把罢',
'': '',
'': '再在',
'': '',
'': '',
'': '沃窝喔',
'': '泥尼拟',
'': '它她塔祂',
'': '',
'': '阿哇',
'': '呐捏',
'': '豆读毒',
'': '',
'': '回汇',
'': '趣取曲',
'': '作坐',
'': '相像',
'': '说税睡',
'': '砍堪刊',
'': '来莱赖',
'': '号毫豪',
'': '给既继',
'': '锅果裹',
'': '',
'': '位未',
'': '甚深伸',
'': '末麽嘛',
'': '话花划',
'': '织直值',
'': '',
'': '听停挺',
'': '见件建',
'': '觉脚搅',
'': '得德锝',
'': '着找招',
'': '向象想',
'': '等灯登',
'': '谢写卸',
'': '对队',
'': '里理鲤',
'': '啦拉喇',
'': '吃持迟',
'': '哦喔噢',
'': '呀压',
'': '',
'': '太抬台',
'': '',
'': '',
'': '以已',
'': '因应',
'': '啥沙傻',
'': '行型形',
'': '哈蛤铪',
'': '嘿黑嗨',
'': '嗯恩摁',
'': '哎爱埃',
'': '呜屋污',
'': '喂位未',
'': '嘛麻马',
'': '嗨害亥',
'': '哇娃蛙',
'': '咦意易',
'': '嘻西希'
}
def random_remove_punctuation(text: str) -> str:
"""随机处理标点符号,模拟人类打字习惯
@@ -381,17 +313,6 @@ def random_remove_punctuation(text: str) -> str:
result += char
return result
def add_typos(text: str) -> str:
TYPO_RATE = 0.02 # 控制错别字出现的概率(2%)
result = ""
for char in text:
if char in TYPO_DICT and random.random() < TYPO_RATE:
# 从可能的错别字中随机选择一个
typos = TYPO_DICT[char]
result += random.choice(typos)
else:
result += char
return result
def process_llm_response(text: str) -> List[str]:
# processed_response = process_text_with_typos(content)
@@ -399,7 +320,14 @@ def process_llm_response(text: str) -> List[str]:
print(f"回复过长 ({len(text)} 字符),返回默认回复")
return ['懒得说']
# 处理长消息
sentences = split_into_sentences_w_remove_punctuation(add_typos(text))
typo_generator = ChineseTypoGenerator(
error_rate=0.03,
min_freq=7,
tone_error_rate=0.2,
word_replace_rate=0.02
)
typoed_text = typo_generator.create_typo_sentence(text)[0]
sentences = split_into_sentences_w_remove_punctuation(typoed_text)
# 检查分割后的消息数量是否过多(超过3条)
if len(sentences) > 4:
print(f"分割后消息数量过多 ({len(sentences)} 条),返回默认回复")

View File

@@ -181,13 +181,19 @@ class Hippocampus:
topic_num = self.calculate_topic_num(input_text, compress_rate)
topics_response = await self.llm_model_get_topic.generate_response(self.find_topic_llm(input_text, topic_num))
# 修改话题处理逻辑
print(f"话题: {topics_response[0]}")
topics = [topic.strip() for topic in topics_response[0].replace("，", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
print(f"话题: {topics}")
# 定义需要过滤的关键词
filter_keywords = ['表情包', '图片', '回复', '聊天记录']
# 创建所有话题的请求任务
# 过滤topics
topics = [topic.strip() for topic in topics_response[0].replace("，", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
filtered_topics = [topic for topic in topics if not any(keyword in topic for keyword in filter_keywords)]
# print(f"原始话题: {topics}")
print(f"过滤后话题: {filtered_topics}")
# 使用过滤后的话题继续处理
tasks = []
for topic in topics:
for topic in filtered_topics:
topic_what_prompt = self.topic_what(input_text, topic)
# 创建异步任务
task = self.llm_model_summary.generate_response_async(topic_what_prompt)
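The topic filtering added above is a plain substring check against filter_keywords; a quick sketch with a hypothetical topic list shows the intended effect:
filter_keywords = ['表情包', '图片', '回复', '聊天记录']
topics = ['今天的天气', '表情包使用', '考试安排']  # hypothetical LLM topic output
filtered_topics = [t for t in topics if not any(k in t for k in filter_keywords)]
print(filtered_topics)  # ['今天的天气', '考试安排']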
@@ -501,9 +507,9 @@ class Hippocampus:
list: 识别出的主题列表
"""
topics_response = await self.llm_model_get_topic.generate_response(self.find_topic_llm(text, 5))
print(f"话题: {topics_response[0]}")
# print(f"话题: {topics_response[0]}")
topics = [topic.strip() for topic in topics_response[0].replace("，", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
print(f"话题: {topics}")
# print(f"话题: {topics}")
return topics
@@ -579,7 +585,7 @@ class Hippocampus:
print(f"\033[1;32m[记忆激活]\033[0m 识别出的主题: {identified_topics}")
if not identified_topics:
print(f"\033[1;32m[记忆激活]\033[0m 未识别出主题,返回0")
# print(f"\033[1;32m[记忆激活]\033[0m 未识别出主题,返回0")
return 0
# 查找相似主题
@@ -644,7 +650,7 @@ class Hippocampus:
return int(activation)
async def get_relevant_memories(self, text: str, max_topics: int = 5, similarity_threshold: float = 0.4) -> list:
async def get_relevant_memories(self, text: str, max_topics: int = 5, similarity_threshold: float = 0.4, max_memory_num: int = 5) -> list:
"""根据输入文本获取相关的记忆内容"""
# 识别主题
identified_topics = await self._identify_topics(text)
@@ -665,6 +671,9 @@ class Hippocampus:
# 获取该主题的记忆内容
first_layer, _ = self.memory_graph.get_related_item(topic, depth=1)
if first_layer:
# 如果记忆条数超过限制,随机选择指定数量的记忆
if len(first_layer) > max_memory_num:
first_layer = random.sample(first_layer, max_memory_num)
# 为每条记忆添加来源主题和相似度信息
for memory in first_layer:
relevant_memories.append({

View File

@@ -234,16 +234,22 @@ class Hippocampus:
async def memory_compress(self, input_text, compress_rate=0.1):
print(input_text)
#获取topics
topic_num = self.calculate_topic_num(input_text, compress_rate)
topics_response = await self.llm_model_get_topic.generate_response_async(self.find_topic_llm(input_text, topic_num))
topics_response = self.llm_model_get_topic.generate_response(self.find_topic_llm(input_text, topic_num))
# 修改话题处理逻辑
# 定义需要过滤的关键词
filter_keywords = ['表情包', '图片', '回复', '聊天记录']
# 过滤topics
topics = [topic.strip() for topic in topics_response[0].replace("，", ",").replace("、", ",").replace(" ", ",").split(",") if topic.strip()]
print(f"话题: {topics}")
filtered_topics = [topic for topic in topics if not any(keyword in topic for keyword in filter_keywords)]
# print(f"原始话题: {topics}")
print(f"过滤后话题: {filtered_topics}")
# 创建所有话题的请求任务
tasks = []
for topic in topics:
for topic in filtered_topics:
topic_what_prompt = self.topic_what(input_text, topic)
# 创建异步任务
task = self.llm_model_small.generate_response_async(topic_what_prompt)
@@ -650,7 +656,22 @@ def visualize_graph_lite(memory_graph: Memory_graph, color_by_memory: bool = Fal
G = memory_graph.G
# 创建一个新图用于可视化
H = G.copy()
H = G.copy()
# 过滤掉内容数量小于2的节点
nodes_to_remove = []
for node in H.nodes():
memory_items = H.nodes[node].get('memory_items', [])
memory_count = len(memory_items) if isinstance(memory_items, list) else (1 if memory_items else 0)
if memory_count < 2:
nodes_to_remove.append(node)
H.remove_nodes_from(nodes_to_remove)
# 如果没有符合条件的节点,直接返回
if len(H.nodes()) == 0:
print("没有找到内容数量大于等于2的节点")
return
# 计算节点大小和颜色
node_colors = []
@@ -704,7 +725,7 @@ def visualize_graph_lite(memory_graph: Memory_graph, color_by_memory: bool = Fal
edge_color='gray',
width=1.5) # 统一的边宽度
title = '记忆图谱可视化 - 节点大小表示记忆数量\n节点颜色:蓝(弱连接)到红(强连接)渐变,边的透明度表示连接强度\n连接强度越大的节点距离越近'
title = '记忆图谱可视化(仅显示内容≥2的节点)\n节点大小表示记忆数量\n节点颜色:蓝(弱连接)到红(强连接)渐变,边的透明度表示连接强度\n连接强度越大的节点距离越近'
plt.title(title, fontsize=16, fontfamily='SimHei')
plt.show()

View File

@@ -0,0 +1,437 @@
"""
错别字生成器 - 基于拼音和字频的中文错别字生成工具
"""
from pypinyin import pinyin, Style
from collections import defaultdict
import json
import os
import jieba
from pathlib import Path
import random
import math
import time
class ChineseTypoGenerator:
def __init__(self,
error_rate=0.3,
min_freq=5,
tone_error_rate=0.2,
word_replace_rate=0.3,
max_freq_diff=200):
"""
初始化错别字生成器
参数:
error_rate: 单字替换概率
min_freq: 最小字频阈值
tone_error_rate: 声调错误概率
word_replace_rate: 整词替换概率
max_freq_diff: 最大允许的频率差异
"""
self.error_rate = error_rate
self.min_freq = min_freq
self.tone_error_rate = tone_error_rate
self.word_replace_rate = word_replace_rate
self.max_freq_diff = max_freq_diff
# 加载数据
print("正在加载汉字数据库,请稍候...")
self.pinyin_dict = self._create_pinyin_dict()
self.char_frequency = self._load_or_create_char_frequency()
def _load_or_create_char_frequency(self):
"""
加载或创建汉字频率字典
"""
cache_file = Path("char_frequency.json")
# 如果缓存文件存在,直接加载
if cache_file.exists():
with open(cache_file, 'r', encoding='utf-8') as f:
return json.load(f)
# 使用内置的词频文件
char_freq = defaultdict(int)
dict_path = os.path.join(os.path.dirname(jieba.__file__), 'dict.txt')
# 读取jieba的词典文件
with open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, freq = line.strip().split()[:2]
# 对词中的每个字进行频率累加
for char in word:
if self._is_chinese_char(char):
char_freq[char] += int(freq)
# 归一化频率值
max_freq = max(char_freq.values())
normalized_freq = {char: freq/max_freq * 1000 for char, freq in char_freq.items()}
# 保存到缓存文件
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(normalized_freq, f, ensure_ascii=False, indent=2)
return normalized_freq
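# Note: after this normalization the most frequent character maps to 1000.0 and all others are scaled
# proportionally, so min_freq (default 5) and max_freq_diff are thresholds on this 0-1000 scale.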
def _create_pinyin_dict(self):
"""
创建拼音到汉字的映射字典
"""
# 常用汉字范围
chars = [chr(i) for i in range(0x4e00, 0x9fff)]
pinyin_dict = defaultdict(list)
# 为每个汉字建立拼音映射
for char in chars:
try:
py = pinyin(char, style=Style.TONE3)[0][0]
pinyin_dict[py].append(char)
except Exception:
continue
return pinyin_dict
def _is_chinese_char(self, char):
"""
判断是否为汉字
"""
try:
return '\u4e00' <= char <= '\u9fff'
except:
return False
def _get_pinyin(self, sentence):
"""
将中文句子拆分成单个汉字并获取其拼音
"""
# 将句子拆分成单个字符
characters = list(sentence)
# 获取每个字符的拼音
result = []
for char in characters:
# 跳过空格和非汉字字符
if char.isspace() or not self._is_chinese_char(char):
continue
# 获取拼音(数字声调)
py = pinyin(char, style=Style.TONE3)[0][0]
result.append((char, py))
return result
def _get_similar_tone_pinyin(self, py):
"""
获取相似声调的拼音
"""
# 检查拼音是否为空或无效
if not py or len(py) < 1:
return py
# 如果最后一个字符不是数字,说明可能是轻声或其他特殊情况
if not py[-1].isdigit():
# 为非数字结尾的拼音添加数字声调1
return py + '1'
base = py[:-1] # 去掉声调
tone = int(py[-1]) # 获取声调
# 处理轻声(通常用5表示)或无效声调
if tone not in [1, 2, 3, 4]:
return base + str(random.choice([1, 2, 3, 4]))
# 正常处理声调
possible_tones = [1, 2, 3, 4]
possible_tones.remove(tone) # 移除原声调
new_tone = random.choice(possible_tones) # 随机选择一个新声调
return base + str(new_tone)
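# Illustrative example: 'hao3' keeps the base 'hao', drops tone 3, and comes back as one of 'hao1', 'hao2' or 'hao4'.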
def _calculate_replacement_probability(self, orig_freq, target_freq):
"""
根据频率差计算替换概率
"""
if target_freq > orig_freq:
return 1.0 # 如果替换字频率更高,保持原有概率
freq_diff = orig_freq - target_freq
if freq_diff > self.max_freq_diff:
return 0.0 # 频率差太大,不替换
# 使用指数衰减函数计算概率
# 频率差为0时概率为1,频率差为max_freq_diff时概率接近0
return math.exp(-3 * freq_diff / self.max_freq_diff)
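# Illustrative values with the default max_freq_diff=200:
#   freq_diff=50  -> exp(-0.75) ≈ 0.47
#   freq_diff=100 -> exp(-1.5)  ≈ 0.22
#   freq_diff=200 -> exp(-3.0)  ≈ 0.05 (any larger difference is rejected outright above)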
def _get_similar_frequency_chars(self, char, py, num_candidates=5):
"""
获取与给定字频率相近的同音字,可能包含声调错误
"""
homophones = []
# 有一定概率使用错误声调
if random.random() < self.tone_error_rate:
wrong_tone_py = self._get_similar_tone_pinyin(py)
homophones.extend(self.pinyin_dict[wrong_tone_py])
# 添加正确声调的同音字
homophones.extend(self.pinyin_dict[py])
if not homophones:
return None
# 获取原字的频率
orig_freq = self.char_frequency.get(char, 0)
# 计算所有同音字与原字的频率差,并过滤掉低频字
freq_diff = [(h, self.char_frequency.get(h, 0))
for h in homophones
if h != char and self.char_frequency.get(h, 0) >= self.min_freq]
if not freq_diff:
return None
# 计算每个候选字的替换概率
candidates_with_prob = []
for h, freq in freq_diff:
prob = self._calculate_replacement_probability(orig_freq, freq)
if prob > 0: # 只保留有效概率的候选字
candidates_with_prob.append((h, prob))
if not candidates_with_prob:
return None
# 根据概率排序
candidates_with_prob.sort(key=lambda x: x[1], reverse=True)
# 返回概率最高的几个字
return [char for char, _ in candidates_with_prob[:num_candidates]]
def _get_word_pinyin(self, word):
"""
获取词语的拼音列表
"""
return [py[0] for py in pinyin(word, style=Style.TONE3)]
def _segment_sentence(self, sentence):
"""
使用jieba分词返回词语列表
"""
return list(jieba.cut(sentence))
def _get_word_homophones(self, word):
"""
获取整个词的同音词,只返回高频的有意义词语
"""
if len(word) == 1:
return []
# 获取词的拼音
word_pinyin = self._get_word_pinyin(word)
# 遍历所有可能的同音字组合
candidates = []
for py in word_pinyin:
chars = self.pinyin_dict.get(py, [])
if not chars:
return []
candidates.append(chars)
# 生成所有可能的组合
import itertools
all_combinations = itertools.product(*candidates)
# 获取jieba词典和词频信息
dict_path = os.path.join(os.path.dirname(jieba.__file__), 'dict.txt')
valid_words = {} # 改用字典存储词语及其频率
with open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
parts = line.strip().split()
if len(parts) >= 2:
word_text = parts[0]
word_freq = float(parts[1]) # 获取词频
valid_words[word_text] = word_freq
# 获取原词的词频作为参考
original_word_freq = valid_words.get(word, 0)
min_word_freq = original_word_freq * 0.1 # 设置最小词频为原词频的10%
# 过滤和计算频率
homophones = []
for combo in all_combinations:
new_word = ''.join(combo)
if new_word != word and new_word in valid_words:
new_word_freq = valid_words[new_word]
# 只保留词频达到阈值的词
if new_word_freq >= min_word_freq:
# 计算词的平均字频(考虑字频和词频)
char_avg_freq = sum(self.char_frequency.get(c, 0) for c in new_word) / len(new_word)
# 综合评分:结合词频和字频
combined_score = (new_word_freq * 0.7 + char_avg_freq * 0.3)
if combined_score >= self.min_freq:
homophones.append((new_word, combined_score))
# 按综合分数排序并限制返回数量
sorted_homophones = sorted(homophones, key=lambda x: x[1], reverse=True)
return [word for word, _ in sorted_homophones[:5]] # 限制返回前5个结果
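# Rough scoring example: a candidate word with jieba word frequency 120 and average character frequency 40
# scores 120 * 0.7 + 40 * 0.3 = 96 and is kept if 96 >= min_freq (it must also retain at least 10% of the
# original word's frequency, per min_word_freq above).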
def create_typo_sentence(self, sentence):
"""
创建包含同音字错误的句子,支持词语级别和字级别的替换
参数:
sentence: 输入的中文句子
返回:
typo_sentence: 包含错别字的句子
typo_info: 错别字信息列表
"""
result = []
typo_info = []
# 分词
words = self._segment_sentence(sentence)
for word in words:
# 如果是标点符号或空格,直接添加
if all(not self._is_chinese_char(c) for c in word):
result.append(word)
continue
# 获取词语的拼音
word_pinyin = self._get_word_pinyin(word)
# 尝试整词替换
if len(word) > 1 and random.random() < self.word_replace_rate:
word_homophones = self._get_word_homophones(word)
if word_homophones:
typo_word = random.choice(word_homophones)
# 计算词的平均频率
orig_freq = sum(self.char_frequency.get(c, 0) for c in word) / len(word)
typo_freq = sum(self.char_frequency.get(c, 0) for c in typo_word) / len(typo_word)
# 添加到结果中
result.append(typo_word)
typo_info.append((word, typo_word,
' '.join(word_pinyin),
' '.join(self._get_word_pinyin(typo_word)),
orig_freq, typo_freq))
continue
# 如果不进行整词替换,则进行单字替换
if len(word) == 1:
char = word
py = word_pinyin[0]
if random.random() < self.error_rate:
similar_chars = self._get_similar_frequency_chars(char, py)
if similar_chars:
typo_char = random.choice(similar_chars)
typo_freq = self.char_frequency.get(typo_char, 0)
orig_freq = self.char_frequency.get(char, 0)
replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
if random.random() < replace_prob:
result.append(typo_char)
typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
typo_info.append((char, typo_char, py, typo_py, orig_freq, typo_freq))
continue
result.append(char)
else:
# 处理多字词的单字替换
word_result = []
for i, (char, py) in enumerate(zip(word, word_pinyin)):
# 词中的字替换概率降低
word_error_rate = self.error_rate * (0.7 ** (len(word) - 1))
if random.random() < word_error_rate:
similar_chars = self._get_similar_frequency_chars(char, py)
if similar_chars:
typo_char = random.choice(similar_chars)
typo_freq = self.char_frequency.get(typo_char, 0)
orig_freq = self.char_frequency.get(char, 0)
replace_prob = self._calculate_replacement_probability(orig_freq, typo_freq)
if random.random() < replace_prob:
word_result.append(typo_char)
typo_py = pinyin(typo_char, style=Style.TONE3)[0][0]
typo_info.append((char, typo_char, py, typo_py, orig_freq, typo_freq))
continue
word_result.append(char)
result.append(''.join(word_result))
return ''.join(result), typo_info
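# Note on the return value: each typo_info entry is a tuple of
# (original, replacement, original_pinyin, replacement_pinyin, original_freq, replacement_freq);
# for whole-word substitutions the two pinyin fields are space-joined strings.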
def format_typo_info(self, typo_info):
"""
格式化错别字信息
参数:
typo_info: 错别字信息列表
返回:
格式化后的错别字信息字符串
"""
if not typo_info:
return "未生成错别字"
result = []
for orig, typo, orig_py, typo_py, orig_freq, typo_freq in typo_info:
# 判断是否为词语替换
is_word = ' ' in orig_py
if is_word:
error_type = "整词替换"
else:
tone_error = orig_py[:-1] == typo_py[:-1] and orig_py[-1] != typo_py[-1]
error_type = "声调错误" if tone_error else "同音字替换"
result.append(f"原文:{orig}({orig_py}) [频率:{orig_freq:.2f}] -> "
f"替换:{typo}({typo_py}) [频率:{typo_freq:.2f}] [{error_type}]")
return "\n".join(result)
def set_params(self, **kwargs):
"""
设置参数
可设置参数:
error_rate: 单字替换概率
min_freq: 最小字频阈值
tone_error_rate: 声调错误概率
word_replace_rate: 整词替换概率
max_freq_diff: 最大允许的频率差异
"""
for key, value in kwargs.items():
if hasattr(self, key):
setattr(self, key, value)
print(f"参数 {key} 已设置为 {value}")
else:
print(f"警告: 参数 {key} 不存在")
def main():
# 创建错别字生成器实例
typo_generator = ChineseTypoGenerator(
error_rate=0.03,
min_freq=7,
tone_error_rate=0.02,
word_replace_rate=0.3
)
# 获取用户输入
sentence = input("请输入中文句子:")
# 创建包含错别字的句子
start_time = time.time()
typo_sentence, typo_info = typo_generator.create_typo_sentence(sentence)
# 打印结果
print("\n原句:", sentence)
print("错字版:", typo_sentence)
# 打印错别字信息
if typo_info:
print("\n错别字信息:")
print(typo_generator.format_typo_info(typo_info))
# 计算并打印总耗时
end_time = time.time()
total_time = end_time - start_time
print(f"\n总耗时:{total_time:.2f}")
if __name__ == "__main__":
main()