v0.3.0 记忆和知识库
beta
This commit is contained in:
@@ -10,6 +10,8 @@ from .relationship_manager import relationship_manager
|
||||
from ..schedule.schedule_generator import bot_schedule
|
||||
from .willing_manager import willing_manager
|
||||
|
||||
from ..memory_system.memory import memory_graph
|
||||
|
||||
|
||||
# 获取驱动器
|
||||
driver = get_driver()
|
||||
@@ -23,6 +25,8 @@ Database.initialize(
|
||||
print("\033[1;32m[初始化配置和数据库完成]\033[0m")
|
||||
|
||||
|
||||
|
||||
|
||||
# 导入其他模块
|
||||
from .bot import ChatBot
|
||||
from .emoji_manager import emoji_manager
|
||||
|
||||
@@ -5,7 +5,7 @@ from .storage import MessageStorage
|
||||
from .llm_generator import LLMResponseGenerator
|
||||
from .message_stream import MessageStream, MessageStreamContainer
|
||||
from .topic_identifier import topic_identifier
|
||||
from random import random
|
||||
from random import random, choice
|
||||
from .emoji_manager import emoji_manager # 导入表情包管理器
|
||||
import time
|
||||
import os
|
||||
@@ -15,6 +15,7 @@ from .message import Message_Thinking # 导入 Message_Thinking 类
|
||||
from .relationship_manager import relationship_manager
|
||||
from .willing_manager import willing_manager # 导入意愿管理器
|
||||
from .utils import is_mentioned_bot_in_txt, calculate_typing_time
|
||||
from ..memory_system.memory import memory_graph
|
||||
|
||||
class ChatBot:
|
||||
def __init__(self, config: BotConfig):
|
||||
@@ -99,6 +100,11 @@ class ChatBot:
|
||||
topic = topic_identifier.identify_topic_jieba(message.processed_plain_text)
|
||||
print(f"\033[1;32m[主题识别]\033[0m 主题: {topic}")
|
||||
|
||||
if topic:
|
||||
for current_topic in topic:
|
||||
first_layer_items, second_layer_items = memory_graph.get_related_item(current_topic, depth=2)
|
||||
if first_layer_items:
|
||||
print(f"\033[1;32m[记忆检索-bot]\033[0m 有印象:{current_topic}")
|
||||
|
||||
await self.storage.store_message(message, topic[0] if topic else None)
|
||||
|
||||
|
||||
@@ -133,8 +133,8 @@ llm_config.DEEP_SEEK_BASE_URL = os.getenv('DEEP_SEEK_BASE_URL')
|
||||
if not global_config.enable_advance_output:
|
||||
logger.remove()
|
||||
|
||||
logging.getLogger('nonebot').handlers.clear()
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setLevel(logging.WARNING) # 只输出 WARNING 及以上级别
|
||||
logging.getLogger('nonebot').addHandler(console_handler)
|
||||
logging.getLogger('nonebot').setLevel(logging.WARNING)
|
||||
# logging.getLogger('nonebot').handlers.clear()
|
||||
# console_handler = logging.StreamHandler()
|
||||
# console_handler.setLevel(logging.WARNING) # 只输出 WARNING 及以上级别
|
||||
# logging.getLogger('nonebot').addHandler(console_handler)
|
||||
# logging.getLogger('nonebot').setLevel(logging.WARNING)
|
||||
|
||||
186
src/plugins/chat/knowledege/knowledge_library.py
Normal file
186
src/plugins/chat/knowledege/knowledge_library.py
Normal file
@@ -0,0 +1,186 @@
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import requests
|
||||
import time
|
||||
|
||||
# Add the project root to the Python path so the absolute `src.*` imports
# that follow resolve when this file is executed directly as a script.
# This file lives in src/plugins/chat/knowledege/, so the directory that
# contains the `src` package is four levels up (three levels up is `src`
# itself, which does not make `import src...` resolvable).
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../.."))
sys.path.append(root_path)
|
||||
|
||||
from src.common.database import Database
|
||||
from src.plugins.chat.config import llm_config
|
||||
|
||||
# Configure the database connection directly with hard-coded settings:
# MongoDB host, MongoDB port, database name.
Database.initialize("127.0.0.1", 27017, "MegBot")
|
||||
|
||||
class KnowledgeLibrary:
    """Embed text files into the database and search them by cosine similarity.

    Text files found in ``raw_info_dir`` are split into fixed-size character
    segments, each segment is embedded through the SiliconFlow embeddings
    endpoint, and the result is upserted into the ``knowledges`` collection.
    Files that were fully ingested are recorded in ``processed_files`` so
    they are skipped on later runs.
    """

    # Number of characters per segment when splitting a file for embedding.
    SEGMENT_SIZE = 300

    def __init__(self):
        self.db = Database.get_instance()
        self.raw_info_dir = "data/raw_info"
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Ensure the raw-info directory exists."""
        os.makedirs(self.raw_info_dir, exist_ok=True)

    @staticmethod
    def _split_segments(content: str, size: int = 300) -> list:
        """Split ``content`` into consecutive chunks of at most ``size`` characters."""
        return [content[i:i + size] for i in range(0, len(content), size)]

    @staticmethod
    def _content_hash(segment: str) -> str:
        """Return a stable SHA-256 hex digest identifying ``segment``.

        The built-in ``hash()`` is randomised per interpreter process for
        strings, so it cannot serve as a persistent de-duplication key.
        """
        import hashlib

        return hashlib.sha256(segment.encode("utf-8")).hexdigest()

    @staticmethod
    def _build_similarity_pipeline(query_embedding: list, limit: int) -> list:
        """Build the aggregation pipeline ranking documents by cosine similarity.

        Args:
            query_embedding: Embedding vector of the query text.
            limit: Maximum number of documents the pipeline returns.

        Returns:
            A list of aggregation stages for the ``knowledges`` collection.
        """
        # The query norm is identical for every document, so compute it once
        # here instead of re-reducing the query vector per document.
        query_norm = sum(x * x for x in query_embedding) ** 0.5

        dot_product = {
            "$reduce": {
                "input": {"$range": [0, {"$size": "$embedding"}]},
                "initialValue": 0,
                "in": {
                    "$add": [
                        "$$value",
                        {"$multiply": [
                            {"$arrayElemAt": ["$embedding", "$$this"]},
                            {"$arrayElemAt": [query_embedding, "$$this"]},
                        ]},
                    ]
                },
            }
        }
        document_norm = {
            "$sqrt": {
                "$reduce": {
                    "input": "$embedding",
                    "initialValue": 0,
                    "in": {"$add": ["$$value", {"$multiply": ["$$this", "$$this"]}]},
                }
            }
        }
        norm_product = {"$multiply": ["$magnitude1", "$magnitude2"]}

        return [
            {
                "$addFields": {
                    "dotProduct": dot_product,
                    "magnitude1": document_norm,
                    "magnitude2": {"$literal": query_norm},
                }
            },
            {
                "$addFields": {
                    "similarity": {
                        # Guard against a zero-length vector: dividing by
                        # zero would abort the whole aggregation.
                        "$cond": [
                            {"$eq": [norm_product, 0]},
                            0,
                            {"$divide": ["$dotProduct", norm_product]},
                        ]
                    }
                }
            },
            {"$sort": {"similarity": -1}},
            {"$limit": limit},
            {"$project": {"content": 1, "similarity": 1, "file_path": 1}},
        ]

    def get_embedding(self, text: str, timeout: float = 30) -> list:
        """Fetch the embedding vector for ``text``.

        Args:
            text: Text to embed.
            timeout: Seconds to wait for the HTTP request before giving up.

        Returns:
            The embedding taken from the first entry of the response's
            ``data`` list, or ``None`` when the request fails or the server
            answers with a non-200 status.
        """
        url = "https://api.siliconflow.cn/v1/embeddings"
        payload = {
            "model": "BAAI/bge-m3",
            "input": text,
            "encoding_format": "float"
        }
        headers = {
            "Authorization": f"Bearer {llm_config.SILICONFLOW_API_KEY}",
            "Content-Type": "application/json"
        }

        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Network failures follow the same "print and return None"
            # contract as a non-200 response.
            print(f"获取embedding失败: {e}")
            return None

        if response.status_code != 200:
            print(f"获取embedding失败: {response.text}")
            return None

        return response.json()['data'][0]['embedding']

    def process_files(self):
        """Process every ``.txt`` file in the raw-info directory."""
        for filename in os.listdir(self.raw_info_dir):
            if filename.endswith('.txt'):
                file_path = os.path.join(self.raw_info_dir, filename)
                self.process_single_file(file_path)

    def process_single_file(self, file_path: str):
        """Split one file into segments, embed them and store them.

        The file is recorded as processed only when every non-empty segment
        was embedded and stored, so a partially failed file is retried on
        the next run. Any exception is reported and swallowed so that one
        bad file does not stop a batch run.

        Args:
            file_path: Path of the UTF-8 text file to ingest.
        """
        try:
            # Skip files that were already ingested.
            if self.db.db.processed_files.find_one({"file_path": file_path}):
                print(f"文件已处理过,跳过: {file_path}")
                return

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            failed_segments = 0
            for segment in self._split_segments(content, self.SEGMENT_SIZE):
                if not segment.strip():  # skip whitespace-only segments
                    continue

                embedding = self.get_embedding(segment)
                if not embedding:
                    failed_segments += 1
                    continue

                doc = {
                    "content": segment,
                    "embedding": embedding,
                    "file_path": file_path,
                    "segment_length": len(segment)
                }

                # Upsert keyed on a stable content digest so re-processing
                # the same text updates the existing document instead of
                # inserting a duplicate.
                self.db.db.knowledges.update_one(
                    {"content_hash": self._content_hash(segment)},
                    {"$set": doc},
                    upsert=True
                )

            if failed_segments:
                # Leave the file unmarked so the missing segments are
                # retried next time; stored segments are simply upserted again.
                print(f"文件有 {failed_segments} 个分段获取embedding失败,未标记为已处理: {file_path}")
                return

            # Record the file as processed.
            self.db.db.processed_files.insert_one({
                "file_path": file_path,
                "processed_time": time.time()
            })

            print(f"成功处理文件: {file_path}")

        except Exception as e:
            # Deliberate best-effort boundary: report and continue.
            print(f"处理文件 {file_path} 时出错: {str(e)}")

    def search_similar_segments(self, query: str, limit: int = 5) -> list:
        """Return the stored segments most similar to ``query``.

        Args:
            query: Text to search for.
            limit: Maximum number of results.

        Returns:
            A list of result documents carrying ``content``, ``similarity``
            and ``file_path``, sorted by descending similarity; an empty
            list when the query embedding cannot be obtained.
        """
        query_embedding = self.get_embedding(query)
        if not query_embedding:
            return []

        pipeline = self._build_similarity_pipeline(query_embedding, limit)
        return list(self.db.db.knowledges.aggregate(pipeline))
|
||||
|
||||
# Create the module-level singleton instance.
knowledge_library = KnowledgeLibrary()

if __name__ == "__main__":
    # Exercise ingestion: process the files in the raw-info directory.
    print("开始处理知识库文件...")
    knowledge_library.process_files()

    # Exercise retrieval with a sample query and print each hit.
    test_query = "麦麦评价一下僕と花"
    print(f"\n搜索与'{test_query}'相似的内容:")
    for hit in knowledge_library.search_similar_segments(test_query):
        score = hit['similarity']
        preview = hit['content'][:100]
        print(f"相似度: {score:.4f}")
        print(f"内容: {preview}...")
        print("-" * 50)
|
||||
@@ -6,6 +6,9 @@ import os
|
||||
from .utils import get_embedding, combine_messages, get_recent_group_detailed_plain_text
|
||||
from ...common.database import Database
|
||||
from .config import global_config
|
||||
from .topic_identifier import topic_identifier
|
||||
from ..memory_system.memory import memory_graph
|
||||
from random import choice
|
||||
|
||||
# 获取当前文件的绝对路径
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -35,6 +38,59 @@ class PromptBuilder:
|
||||
Returns:
|
||||
str: 构建好的prompt
|
||||
"""
|
||||
|
||||
|
||||
memory_prompt = ''
|
||||
start_time = time.time() # 记录开始时间
|
||||
topic = topic_identifier.identify_topic_jieba(message_txt)
|
||||
# print(f"\033[1;32m[pb主题识别]\033[0m 主题: {topic}")
|
||||
|
||||
all_first_layer_items = [] # 存储所有第一层记忆
|
||||
all_second_layer_items = {} # 用字典存储每个topic的第二层记忆
|
||||
overlapping_second_layer = set() # 存储重叠的第二层记忆
|
||||
|
||||
if topic:
|
||||
# 遍历所有topic
|
||||
for current_topic in topic:
|
||||
first_layer_items, second_layer_items = memory_graph.get_related_item(current_topic, depth=2)
|
||||
if first_layer_items:
|
||||
print(f"\033[1;32m[pb记忆检索]\033[0m 主题 '{current_topic}' 的第一层记忆: {first_layer_items}")
|
||||
|
||||
# 记录第一层数据
|
||||
all_first_layer_items.extend(first_layer_items)
|
||||
|
||||
# 记录第二层数据
|
||||
all_second_layer_items[current_topic] = second_layer_items
|
||||
|
||||
# 检查是否有重叠的第二层数据
|
||||
for other_topic, other_second_layer in all_second_layer_items.items():
|
||||
if other_topic != current_topic:
|
||||
# 找到重叠的记忆
|
||||
overlap = set(second_layer_items) & set(other_second_layer)
|
||||
if overlap:
|
||||
print(f"\033[1;32m[pb记忆检索]\033[0m 发现主题 '{current_topic}' 和 '{other_topic}' 有共同的第二层记忆: {overlap}")
|
||||
overlapping_second_layer.update(overlap)
|
||||
|
||||
# 合并所有需要的记忆
|
||||
if all_first_layer_items:
|
||||
print(f"\033[1;32m[pb记忆检索]\033[0m 合并所有需要的记忆1: {all_first_layer_items}")
|
||||
if overlapping_second_layer:
|
||||
print(f"\033[1;32m[pb记忆检索]\033[0m 合并所有需要的记忆2: {list(overlapping_second_layer)}")
|
||||
|
||||
all_memories = all_first_layer_items + list(overlapping_second_layer)
|
||||
|
||||
if all_memories: # 只在列表非空时选择随机项
|
||||
random_item = choice(all_memories)
|
||||
memory_prompt = f"看到这些聊天,你想起来{random_item}\n"
|
||||
else:
|
||||
memory_prompt = "" # 如果没有记忆,则返回空字符串
|
||||
|
||||
end_time = time.time() # 记录结束时间
|
||||
print(f"\033[1;32m[回忆耗时]\033[0m 耗时: {(end_time - start_time):.3f}秒") # 输出耗时
|
||||
|
||||
|
||||
|
||||
|
||||
#先禁用关系
|
||||
if 0 > 30:
|
||||
relation_prompt = "关系特别特别好,你很喜欢喜欢他"
|
||||
@@ -55,12 +111,17 @@ class PromptBuilder:
|
||||
prompt_date = f'''今天是{current_date},现在是{current_time},你今天的日程是:\n{bot_schedule.today_schedule}\n你现在正在{bot_schedule_now_activity}\n'''
|
||||
|
||||
#知识构建
|
||||
start_time = time.time()
|
||||
|
||||
prompt_info = ''
|
||||
promt_info_prompt = ''
|
||||
prompt_info = self.get_prompt_info(message_txt,threshold=0.5)
|
||||
if prompt_info:
|
||||
prompt_info = f'''\n----------------------------------------------------\n你有以下这些[知识]:\n{prompt_info}\n请你记住上面的[知识],之后可能会用到\n----------------------------------------------------\n'''
|
||||
promt_info_prompt = '你有一些[知识],在上面可以参考。'
|
||||
|
||||
end_time = time.time()
|
||||
print(f"\033[1;32m[知识检索]\033[0m 耗时: {(end_time - start_time):.3f}秒")
|
||||
# print(f"\033[1;34m[调试]\033[0m 获取知识库内容结果: {prompt_info}")
|
||||
|
||||
|
||||
@@ -69,11 +130,13 @@ class PromptBuilder:
|
||||
chat_talking_prompt = ''
|
||||
if group_id:
|
||||
chat_talking_prompt = get_recent_group_detailed_plain_text(self.db, group_id, limit=global_config.MAX_CONTEXT_SIZE,combine = True)
|
||||
|
||||
chat_talking_prompt = f"以下是群里正在聊天的内容:\n{chat_talking_prompt}"
|
||||
# print(f"\033[1;34m[调试]\033[0m 已从数据库获取群 {group_id} 的消息记录:{chat_talking_prompt}")
|
||||
|
||||
#激活prompt构建
|
||||
activate_prompt = ''
|
||||
activate_prompt = f"以上是群里正在进行的聊天,昵称为 '{sender_name}' 的用户说的:{message_txt}。引起了你的注意,你和他{relation_prompt},你想要{relation_prompt_2}。"
|
||||
activate_prompt = f"以上是群里正在进行的聊天,{memory_prompt} 现在昵称为 '{sender_name}' 的用户说的:{message_txt}。引起了你的注意,你和他{relation_prompt},你想要{relation_prompt_2}。"
|
||||
|
||||
#检测机器人相关词汇
|
||||
bot_keywords = ['人机', 'bot', '机器', '入机', 'robot', '机器人']
|
||||
@@ -87,13 +150,12 @@ class PromptBuilder:
|
||||
prompt_personality = ''
|
||||
personality_choice = random.random()
|
||||
if personality_choice < 4/6: # 第一种人格
|
||||
prompt_personality = f'''你的网名叫{global_config.BOT_NICKNAME},是一个学习地质的女大学生,喜欢摄影,你会刷贴吧,你正在浏览qq群,{promt_info_prompt},
|
||||
{activate_prompt}
|
||||
prompt_personality = f'''{activate_prompt}你的网名叫{global_config.BOT_NICKNAME},是一个学习地质的女大学生,喜欢摄影,你会刷贴吧,你正在浏览qq群,{promt_info_prompt},
|
||||
现在请你给出日常且口语化的回复,平淡一些,尽量简短一些。{is_bot_prompt}
|
||||
请注意把握群里的聊天内容,不要回复的太有条理,可以有个性。'''
|
||||
elif personality_choice < 1: # 第二种人格
|
||||
prompt_personality = f'''你的网名叫{global_config.BOT_NICKNAME},是一个女大学生,你有黑色头发,你会刷小红书, 你正在浏览qq群,{promt_info_prompt},
|
||||
{activate_prompt}
|
||||
prompt_personality = f'''{activate_prompt}你的网名叫{global_config.BOT_NICKNAME},是一个女大学生,你有黑色头发,你会刷小红书, 你正在浏览qq群,{promt_info_prompt},
|
||||
|
||||
现在请你给出日常且口语化的回复,请表现你自己的见解,不要一昧迎合,尽量简短一些。{is_bot_prompt}
|
||||
请你表达自己的见解和观点。可以有个性。'''
|
||||
|
||||
@@ -108,7 +170,7 @@ class PromptBuilder:
|
||||
|
||||
|
||||
#额外信息要求
|
||||
extra_info = '''但是记得回复平淡一些,简短一些,不要过多提及自身的背景, 记住不要输出多余内容(包括前后缀,冒号和引号,括号,表情等),只需要输出回复内容就好,不要输出其他任何内容'''
|
||||
extra_info = '''但是记得回复平淡一些,简短一些,记住不要输出多余内容(包括前后缀,冒号和引号,括号,表情等),只需要输出回复内容就好,不要输出其他任何内容'''
|
||||
|
||||
|
||||
|
||||
@@ -116,7 +178,10 @@ class PromptBuilder:
|
||||
prompt = ""
|
||||
prompt += f"{prompt_info}\n"
|
||||
prompt += f"{prompt_date}\n"
|
||||
prompt += f"{chat_talking_prompt}\n"
|
||||
prompt += f"{chat_talking_prompt}\n"
|
||||
|
||||
# prompt += f"{memory_prompt}\n"
|
||||
|
||||
# prompt += f"{activate_prompt}\n"
|
||||
prompt += f"{prompt_personality}\n"
|
||||
prompt += f"{prompt_ger}\n"
|
||||
|
||||
Reference in New Issue
Block a user