diff --git a/src/chat/focus_chat/info_processors/relationship_processor.py b/src/chat/focus_chat/info_processors/relationship_processor.py
index 8f128d993..c96e5a645 100644
--- a/src/chat/focus_chat/info_processors/relationship_processor.py
+++ b/src/chat/focus_chat/info_processors/relationship_processor.py
@@ -23,10 +23,8 @@ def init_prompt():
 你和别人的关系信息是,请从这些信息中提取出你和别人的关系的原文:
 {relation_prompt}
 
-请只从上面这些信息中提取出。
+请只从上面这些信息中提取出内容。
 
-
-现在是{time_now},你正在上网,和qq群里的网友们聊天,以下是正在进行的聊天内容:
 {chat_observe_info}
 
 现在请你根据现有的信息,总结你和群里的人的关系
diff --git a/src/chat/normal_chat/normal_chat.py b/src/chat/normal_chat/normal_chat.py
index 11ed253f4..8c6119b93 100644
--- a/src/chat/normal_chat/normal_chat.py
+++ b/src/chat/normal_chat/normal_chat.py
@@ -383,11 +383,14 @@ class NormalChat:
                 logger.error(f"[{self.stream_name}] 动作规划异常: {plan_result}")
             elif plan_result:
                 logger.debug(f"[{self.stream_name}] 额外动作处理完成: {plan_result['action_type']}")
-
+
             if not response_set or (
                 self.enable_planner and self.action_type not in ["no_action", "change_to_focus_chat"]
             ):
-                logger.info(f"[{self.stream_name}] 模型未生成回复内容")
+                if not response_set:
+                    logger.info(f"[{self.stream_name}] 模型未生成回复内容")
+                elif self.enable_planner and self.action_type not in ["no_action", "change_to_focus_chat"]:
+                    logger.info(f"[{self.stream_name}] 模型选择其他动作")
                 # 如果模型未生成回复,移除思考消息
                 container = await message_manager.get_container(self.stream_id)  # 使用 self.stream_id
                 for msg in container.messages[:]:
@@ -443,7 +446,6 @@ class NormalChat:
                 logger.warning(f"[{self.stream_name}] 没有设置切换到focus聊天模式的回调函数,无法执行切换")
                 return
             else:
-                # await self._check_switch_to_focus()
-                pass
+                await self._check_switch_to_focus()
 
         info_catcher.done_catch()
diff --git a/src/chat/normal_chat/normal_chat_generator.py b/src/chat/normal_chat/normal_chat_generator.py
index ae1f1109e..ad6bab74c 100644
--- a/src/chat/normal_chat/normal_chat_generator.py
+++ b/src/chat/normal_chat/normal_chat_generator.py
@@ -57,7 +57,7 @@ class NormalChatGenerator:
         )
 
         if model_response:
-            logger.debug(f"{global_config.bot.nickname}的原始回复是:{model_response}")
+            logger.debug(f"{global_config.bot.nickname}的备选回复是:{model_response}")
            model_response = process_llm_response(model_response)
 
            return model_response
diff --git a/src/person_info/relationship_manager.py b/src/person_info/relationship_manager.py
index fb0f04cca..fce928231 100644
--- a/src/person_info/relationship_manager.py
+++ b/src/person_info/relationship_manager.py
@@ -13,6 +13,9 @@ from json_repair import repair_json
 from datetime import datetime
 from difflib import SequenceMatcher
 import ast
+import jieba
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 
 logger = get_logger("relation")
 
@@ -119,6 +122,8 @@ class RelationshipManager:
 
         person_id = person_info_manager.get_person_id(person[0], person[1])
         person_name = await person_info_manager.get_value(person_id, "person_name")
+        if not person_name or person_name == "none":
+            return ""
         impression = await person_info_manager.get_value(person_id, "impression")
         interaction = await person_info_manager.get_value(person_id, "interaction")
         points = await person_info_manager.get_value(person_id, "points") or []
@@ -324,8 +329,8 @@ class RelationshipManager:
 
         # 在现有points中查找相似的点
         for i, existing_point in enumerate(current_points):
-            similarity = SequenceMatcher(None, new_point[0], existing_point[0]).ratio()
-            if similarity > 0.8:
+            # 使用组合的相似度检查方法
+            if self.check_similarity(new_point[0], existing_point[0]):
                 similar_points.append(existing_point)
                 similar_indices.append(i)
 
@@ -355,7 +360,7 @@ class RelationshipManager:
         current_points = points_list
 
-        # 如果points超过30条,按权重随机选择多余的条目移动到forgotten_points
-        if len(current_points) > 5:
+        # 如果points超过10条,按权重随机选择多余的条目移动到forgotten_points
+        if len(current_points) > 10:
             # 获取现有forgotten_points
             forgotten_points = await person_info_manager.get_value(person_id, "forgotten_points") or []
             if isinstance(forgotten_points, str):
@@ -576,5 +581,56 @@ class RelationshipManager:
             self.logger.error(f"计算时间权重失败: {e}")
             return 0.5  # 发生错误时返回中等权重
 
+    def tfidf_similarity(self, s1, s2):
+        """
+        使用 TF-IDF 和余弦相似度计算两个句子的相似性。
+        """
+        # 1. 使用 jieba 进行分词
+        s1_words = " ".join(jieba.cut(s1))
+        s2_words = " ".join(jieba.cut(s2))
+
+        # 2. 将两句话放入一个列表中
+        corpus = [s1_words, s2_words]
+
+        # 3. 创建 TF-IDF 向量化器并进行计算
+        try:
+            vectorizer = TfidfVectorizer()
+            tfidf_matrix = vectorizer.fit_transform(corpus)
+        except ValueError:
+            # 如果句子完全由停用词组成,或者为空,可能会报错
+            return 0.0
+
+        # 4. 计算余弦相似度
+        similarity_matrix = cosine_similarity(tfidf_matrix)
+
+        # 返回 s1 和 s2 的相似度
+        return similarity_matrix[0, 1]
+
+    def sequence_similarity(self, s1, s2):
+        """
+        使用 SequenceMatcher 计算两个句子的相似性。
+        """
+        return SequenceMatcher(None, s1, s2).ratio()
+
+    def check_similarity(self, text1, text2, tfidf_threshold=0.5, seq_threshold=0.6):
+        """
+        使用两种方法检查文本相似度,只要其中一种方法达到阈值就认为是相似的。
+
+        Args:
+            text1: 第一个文本
+            text2: 第二个文本
+            tfidf_threshold: TF-IDF相似度阈值
+            seq_threshold: SequenceMatcher相似度阈值
+
+        Returns:
+            bool: 如果任一方法达到阈值则返回True
+        """
+        # 计算两种相似度
+        tfidf_sim = self.tfidf_similarity(text1, text2)
+        seq_sim = self.sequence_similarity(text1, text2)
+
+        # 只要其中一种方法达到阈值就认为是相似的
+        return tfidf_sim > tfidf_threshold or seq_sim > seq_threshold
+
 
 relationship_manager = RelationshipManager()
diff --git a/tests/test_sentence_similarity.py b/tests/test_sentence_similarity.py
new file mode 100644
index 000000000..c5e46751f
--- /dev/null
+++ b/tests/test_sentence_similarity.py
@@ -0,0 +1,156 @@
+import time
+import unittest
+import jieba
+from difflib import SequenceMatcher
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def tfidf_similarity(s1, s2):
+    """
+    使用 TF-IDF 和余弦相似度计算两个句子的相似性。
+    """
+    # 1. 使用 jieba 进行分词
+    s1_words = " ".join(jieba.cut(s1))
+    s2_words = " ".join(jieba.cut(s2))
+
+    # 2. 将两句话放入一个列表中
+    corpus = [s1_words, s2_words]
+
+    # 3. 创建 TF-IDF 向量化器并进行计算
+    try:
+        vectorizer = TfidfVectorizer()
+        tfidf_matrix = vectorizer.fit_transform(corpus)
+    except ValueError:
+        # 如果句子完全由停用词组成,或者为空,可能会报错
+        return 0.0
+
+    # 4. 计算余弦相似度
+    similarity_matrix = cosine_similarity(tfidf_matrix)
+
+    # 返回 s1 和 s2 的相似度
+    return similarity_matrix[0, 1]
+
+def sequence_similarity(s1, s2):
+    """
+    使用 SequenceMatcher 计算两个句子的相似性。
+    """
+    return SequenceMatcher(None, s1, s2).ratio()
+
+class TestSentenceSimilarity(unittest.TestCase):
+    def test_similarity_comparison(self):
+        """比较不同相似度计算方法的结果"""
+        test_cases = [
+            {
+                "sentence1": "今天天气怎么样",
+                "sentence2": "今天气候如何",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "今天天气怎么样",
+                "sentence2": "我今天要去吃麦当劳",
+                "expected_similar": False
+            },
+            {
+                "sentence1": "我今天要去吃麦当劳",
+                "sentence2": "肯德基和麦当劳哪家好吃",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "Vindemiatrix提到昨天三个无赖杀穿交界地",
+                "sentence2": "Vindemiatrix昨天用三个无赖角色杀穿了游戏中的交界地",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "tc_魔法士解释了之前templateinfo的with用法和现在的单独逻辑发送的区别",
+                "sentence2": "tc_魔法士解释了templateinfo的用法,包括它是一个字典,key是prompt的名字,value是prompt的内容,格式是只支持大括号的fstring",
+                "expected_similar": False
+            },
+            {
+                "sentence1": "YXH_XianYu分享了一张舰娘街机游戏的图片,并提到'玩舰娘街机的董不懂'",
+                "sentence2": "YXH_XianYu对街机游戏表现出兴趣,并分享了玩舰娘街机的经历",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "YXH_XianYu在考虑入坑明日方舟,犹豫是否要从零开荒或使用初始号",
+                "sentence2": "YXH_XianYu考虑入坑明日方舟,倾向于从零开荒或初始号开荒",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "YXH_XianYu提到秋叶原好多人在玩maimai",
+                "sentence2": "YXH_XianYu对学园偶像的付费石头机制表示惊讶",
+                "expected_similar": False
+            }
+        ]
+
+        print("\n相似度计算方法比较:")
+        for i, case in enumerate(test_cases, 1):
+            print(f"\n测试用例 {i}:")
+            print(f"句子1: {case['sentence1']}")
+            print(f"句子2: {case['sentence2']}")
+
+            # TF-IDF 相似度
+            start_time = time.time()
+            tfidf_sim = tfidf_similarity(case['sentence1'], case['sentence2'])
+            tfidf_time = time.time() - start_time
+
+            # SequenceMatcher 相似度
+            start_time = time.time()
+            seq_sim = sequence_similarity(case['sentence1'], case['sentence2'])
+            seq_time = time.time() - start_time
+
+            print(f"TF-IDF相似度: {tfidf_sim:.4f} (耗时: {tfidf_time:.4f}秒)")
+            print(f"SequenceMatcher相似度: {seq_sim:.4f} (耗时: {seq_time:.4f}秒)")
+
+    def test_batch_processing(self):
+        """测试批量处理性能"""
+        sentences = [
+            "人工智能正在改变世界",
+            "AI技术发展迅速",
+            "机器学习是人工智能的一个分支",
+            "深度学习在图像识别领域取得了突破",
+            "自然语言处理技术越来越成熟"
+        ]
+
+        print("\n批量处理测试:")
+
+        # TF-IDF 批量处理
+        start_time = time.time()
+        tfidf_matrix = []
+        for i in range(len(sentences)):
+            row = []
+            for j in range(len(sentences)):
+                similarity = tfidf_similarity(sentences[i], sentences[j])
+                row.append(similarity)
+            tfidf_matrix.append(row)
+        tfidf_time = time.time() - start_time
+
+        # SequenceMatcher 批量处理
+        start_time = time.time()
+        seq_matrix = []
+        for i in range(len(sentences)):
+            row = []
+            for j in range(len(sentences)):
+                similarity = sequence_similarity(sentences[i], sentences[j])
+                row.append(similarity)
+            seq_matrix.append(row)
+        seq_time = time.time() - start_time
+
+        print(f"TF-IDF批量处理 {len(sentences)} 个句子耗时: {tfidf_time:.4f}秒")
+        print(f"SequenceMatcher批量处理 {len(sentences)} 个句子耗时: {seq_time:.4f}秒")
+
+        # 打印TF-IDF相似度矩阵
+        print("\nTF-IDF相似度矩阵:")
+        for row in tfidf_matrix:
+            for similarity in row:
+                print(f"{similarity:.4f}", end="\t")
+            print()
+
+        # 打印SequenceMatcher相似度矩阵
+        print("\nSequenceMatcher相似度矩阵:")
+        for row in seq_matrix:
+            for similarity in row:
+                print(f"{similarity:.4f}", end="\t")
+            print()
+
+if __name__ == '__main__':
+    unittest.main(verbosity=2)
\ No newline at end of file
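
A note on the combined similarity gate this diff introduces in `check_similarity`: the TF-IDF path catches paraphrases that reuse vocabulary (word-level overlap after jieba segmentation), while `SequenceMatcher` catches near-verbatim rewordings (character-level overlap), so OR-ing the two thresholds flags more duplicate points than the previous single `SequenceMatcher > 0.8` check. Below is a minimal standalone sketch of that gate for experimenting with thresholds outside the class; the function name `is_similar_point` and the `__main__` sample pairs are illustrative, not part of the patch, and the 0.5/0.6 defaults mirror the ones in the diff.

```python
import jieba
from difflib import SequenceMatcher

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def is_similar_point(s1: str, s2: str, tfidf_threshold: float = 0.5, seq_threshold: float = 0.6) -> bool:
    """Combined duplicate check: word-level TF-IDF cosine OR character-level ratio."""
    # Word-level path: segment with jieba, then compare TF-IDF vectors of the two sentences.
    corpus = [" ".join(jieba.cut(s1)), " ".join(jieba.cut(s2))]
    try:
        tfidf_sim = cosine_similarity(TfidfVectorizer().fit_transform(corpus))[0, 1]
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary
        # (empty strings or stop-words only); treat that as no overlap.
        tfidf_sim = 0.0

    # Character-level path: SequenceMatcher ratio on the raw strings.
    seq_sim = SequenceMatcher(None, s1, s2).ratio()

    # Either signal crossing its threshold marks the pair as duplicates.
    return tfidf_sim > tfidf_threshold or seq_sim > seq_threshold


if __name__ == "__main__":
    # A paraphrase pair with little verbatim overlap, then an unrelated pair.
    print(is_similar_point("今天天气怎么样", "今天气候如何"))
    print(is_similar_point("今天天气怎么样", "我今天要去吃麦当劳"))
```

One caveat when tuning the TF-IDF threshold: `TfidfVectorizer`'s default `token_pattern` (`(?u)\b\w\w+\b`) requires tokens of at least two word characters, so single-character Chinese words produced by jieba never reach the vectorizer. If short sentences score lower than expected, passing `token_pattern=r"(?u)\b\w+\b"` keeps those tokens.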