better：优化interest的算法，更好更强

2025-07-25 15:45:14 +08:00
parent 2aec68bd3d
commit 8c9b2b54c0
4 changed files with 511 additions and 75 deletions
--- a/src/chat/memory_system/Hippocampus.py
+++ b/src/chat/memory_system/Hippocampus.py
@@ -299,6 +299,63 @@ class Hippocampus:
        # 按相似度降序排序
        memories.sort(key=lambda x: x[2], reverse=True)
        return memories
+    
+    async def get_keywords_from_text(self, text: str, fast_retrieval: bool = False) -> list:
+        """从文本中提取关键词。
+        
+        Args:
+            text (str): 输入文本
+            fast_retrieval (bool, optional): 是否使用快速检索。默认为False。
+                如果为True，使用jieba分词提取关键词，速度更快但可能不够准确。
+                如果为False，使用LLM提取关键词，速度较慢但更准确。
+        """
+        if not text:
+            return []
+
+        if fast_retrieval:
+            # 使用jieba分词提取关键词
+            words = jieba.cut(text)
+            # 过滤掉停用词和单字词
+            keywords = [word for word in words if len(word) > 1]
+            # 去重
+            keywords = list(set(keywords))
+            # 限制关键词数量
+            logger.debug(f"提取关键词: {keywords}")
+
+        else:
+            # 使用LLM提取关键词 - 根据详细文本长度分布优化topic_num计算
+            text_length = len(text)
+            if text_length <= 5:
+                topic_num = 1  # 1-5字符: 1个关键词 (26.57%的文本)
+            elif text_length <= 10:
+                topic_num = 1  # 6-10字符: 1个关键词 (27.18%的文本)
+            elif text_length <= 20:
+                topic_num = 2  # 11-20字符: 2个关键词 (22.76%的文本)
+            elif text_length <= 30:
+                topic_num = 3  # 21-30字符: 3个关键词 (10.33%的文本)
+            elif text_length <= 50:
+                topic_num = 4  # 31-50字符: 4个关键词 (9.79%的文本)
+            else:
+                topic_num = 5  # 51+字符: 5个关键词 (其余长文本)
+            
+            # logger.info(f"提取关键词数量: {topic_num}")
+            topics_response, (reasoning_content, model_name) = await self.model_summary.generate_response_async(
+                self.find_topic_llm(text, topic_num)
+            )
+
+            # 提取关键词
+            keywords = re.findall(r"<([^>]+)>", topics_response)
+            if not keywords:
+                keywords = []
+            else:
+                keywords = [
+                    keyword.strip()
+                    for keyword in ",".join(keywords).replace("，", ",").replace("、", ",").replace(" ", ",").split(",")
+                    if keyword.strip()
+                ]
+            
+            return keywords 
+        

    async def get_memory_from_text(
        self,
@@ -325,39 +382,7 @@ class Hippocampus:
                - memory_items: list, 该主题下的记忆项列表
                - similarity: float, 与文本的相似度
        """
-        if not text:
-            return []
-
-        if fast_retrieval:
-            # 使用jieba分词提取关键词
-            words = jieba.cut(text)
-            # 过滤掉停用词和单字词
-            keywords = [word for word in words if len(word) > 1]
-            # 去重
-            keywords = list(set(keywords))
-            # 限制关键词数量
-            logger.debug(f"提取关键词: {keywords}")
-
-        else:
-            # 使用LLM提取关键词
-            topic_num = min(5, max(1, int(len(text) * 0.1)))  # 根据文本长度动态调整关键词数量
-            # logger.info(f"提取关键词数量: {topic_num}")
-            topics_response, (reasoning_content, model_name) = await self.model_summary.generate_response_async(
-                self.find_topic_llm(text, topic_num)
-            )
-
-            # 提取关键词
-            keywords = re.findall(r"<([^>]+)>", topics_response)
-            if not keywords:
-                keywords = []
-            else:
-                keywords = [
-                    keyword.strip()
-                    for keyword in ",".join(keywords).replace("，", ",").replace("、", ",").replace(" ", ",").split(",")
-                    if keyword.strip()
-                ]
-
-        # logger.info(f"提取的关键词: {', '.join(keywords)}")
+        keywords = await self.get_keywords_from_text(text, fast_retrieval)

        # 过滤掉不存在于记忆图中的关键词
        valid_keywords = [keyword for keyword in keywords if keyword in self.memory_graph.G]
@@ -679,38 +704,7 @@ class Hippocampus:
        Returns:
            float: 激活节点数与总节点数的比值
        """
-        if not text:
-            return 0
-
-        if fast_retrieval:
-            # 使用jieba分词提取关键词
-            words = jieba.cut(text)
-            # 过滤掉停用词和单字词
-            keywords = [word for word in words if len(word) > 1]
-            # 去重
-            keywords = list(set(keywords))
-            # 限制关键词数量
-            keywords = keywords[:5]
-        else:
-            # 使用LLM提取关键词
-            topic_num = min(5, max(1, int(len(text) * 0.1)))  # 根据文本长度动态调整关键词数量
-            # logger.info(f"提取关键词数量: {topic_num}")
-            topics_response, (reasoning_content, model_name) = await self.model_summary.generate_response_async(
-                self.find_topic_llm(text, topic_num)
-            )
-
-            # 提取关键词
-            keywords = re.findall(r"<([^>]+)>", topics_response)
-            if not keywords:
-                keywords = []
-            else:
-                keywords = [
-                    keyword.strip()
-                    for keyword in ",".join(keywords).replace("，", ",").replace("、", ",").replace(" ", ",").split(",")
-                    if keyword.strip()
-                ]
-
-        # logger.info(f"提取的关键词: {', '.join(keywords)}")
+        keywords = await self.get_keywords_from_text(text, fast_retrieval)

        # 过滤掉不存在于记忆图中的关键词
        valid_keywords = [keyword for keyword in keywords if keyword in self.memory_graph.G]