feat(core): 集成统一向量数据库服务并重构相关模块

本次提交引入了一个统一的、可扩展的向量数据库服务层，旨在解决代码重复、实现分散以及数据库实例泛滥的问题。主要变更: 新增向量数据库抽象层: 在 src/common/vector_db/ 目录下创建了 VectorDBBase 抽象基类，定义了标准化的数据库操作接口。创建了 ChromaDBImpl 作为具体的实现，并采用单例模式确保全局只有一个数据库客户端实例。重构语义缓存 (CacheManager): 移除了对 chromadb 库的直接依赖。改为调用统一的 vector_db_service 来进行向量的添加和查询操作。重构瞬时记忆 (VectorInstantMemoryV2): 彻底解决了为每个 chat_id 创建独立数据库实例的问题。现在所有记忆数据都存储在统一的 instant_memory 集合中，并通过 metadata 中的 chat_id 进行数据隔离和查询。新增使用文档: 在 docs/ 目录下添加了 vector_db_usage_guide.md，详细说明了如何使用新的 vector_db_service 代码接口。带来的好处: 高内聚，低耦合: 业务代码与具体的向量数据库实现解耦。易于维护和扩展: 未来可以轻松替换或添加新的向量数据库支持。性能与资源优化: 整个应用共享一个数据库连接，显著减少了文件句柄和内存占用
2025-08-27 19:18:28 +08:00
parent 27dfc32fdf
commit 4ced72010b
8 changed files with 488 additions and 99 deletions
--- a/src/common/vector_db/__init__.py
+++ b/src/common/vector_db/__init__.py
@@ -0,0 +1,19 @@
+from .base import VectorDBBase
+from .chromadb_impl import ChromaDBImpl
+
+def get_vector_db_service() -> VectorDBBase:
+    """
+    工厂函数，初始化并返回向量数据库服务实例。
+    
+    目前硬编码为 ChromaDB，未来可以从配置中读取。
+    """
+    # TODO: 从全局配置中读取数据库类型和路径
+    db_path = "data/chroma_db"
+    
+    # ChromaDBImpl 是一个单例，所以这里每次调用都会返回同一个实例
+    return ChromaDBImpl(path=db_path)
+
+# Global vector database service instance. It is created as a side effect of
+# importing this package, because the factory is called at module level.
+vector_db_service: VectorDBBase = get_vector_db_service()
+
+__all__ = ["vector_db_service", "VectorDBBase"]
--- a/src/common/vector_db/base.py
+++ b/src/common/vector_db/base.py
@@ -0,0 +1,117 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+
+class VectorDBBase(ABC):
+    """
+    向量数据库的抽象基类 (ABC)，定义了所有向量数据库实现必须遵循的接口。
+    """
+
+    @abstractmethod
+    def __init__(self, path: str, **kwargs: Any):
+        """
+        初始化向量数据库客户端。
+
+        Args:
+            path (str): 数据库文件的存储路径。
+            **kwargs: 其他特定于实现的参数。
+        """
+        pass
+
+    @abstractmethod
+    def get_or_create_collection(self, name: str, **kwargs: Any) -> Any:
+        """
+        获取或创建一个集合 (Collection)。
+
+        Args:
+            name (str): 集合的名称。
+            **kwargs: 其他特定于实现的参数 (例如 metadata)。
+
+        Returns:
+            Any: 代表集合的对象。
+        """
+        pass
+
+    @abstractmethod
+    def add(
+        self,
+        collection_name: str,
+        embeddings: List[List[float]],
+        documents: Optional[List[str]] = None,
+        metadatas: Optional[List[Dict[str, Any]]] = None,
+        ids: Optional[List[str]] = None,
+    ) -> None:
+        """
+        向指定集合中添加数据。
+
+        Args:
+            collection_name (str): 目标集合的名称。
+            embeddings (List[List[float]]): 向量列表。
+            documents (Optional[List[str]], optional): 文档列表。Defaults to None.
+            metadatas (Optional[List[Dict[str, Any]]], optional): 元数据列表。Defaults to None.
+            ids (Optional[List[str]], optional): ID 列表。Defaults to None.
+        """
+        pass
+
+    @abstractmethod
+    def query(
+        self,
+        collection_name: str,
+        query_embeddings: List[List[float]],
+        n_results: int = 1,
+        where: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> Dict[str, List[Any]]:
+        """
+        在指定集合中查询相似向量。
+
+        Args:
+            collection_name (str): 目标集合的名称。
+            query_embeddings (List[List[float]]): 用于查询的向量列表。
+            n_results (int, optional): 返回结果的数量。Defaults to 1.
+            where (Optional[Dict[str, Any]], optional): 元数据过滤条件。Defaults to None.
+            **kwargs: 其他特定于实现的参数。
+
+        Returns:
+            Dict[str, List[Any]]: 查询结果，通常包含 ids, distances, metadatas, documents。
+        """
+        pass
+
+    @abstractmethod
+    def delete(
+        self,
+        collection_name: str,
+        ids: Optional[List[str]] = None,
+        where: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        从指定集合中删除数据。
+
+        Args:
+            collection_name (str): 目标集合的名称。
+            ids (Optional[List[str]], optional): 要删除的条目的 ID 列表。Defaults to None.
+            where (Optional[Dict[str, Any]], optional): 基于元数据的过滤条件。Defaults to None.
+        """
+        pass
+
+    @abstractmethod
+    def count(self, collection_name: str) -> int:
+        """
+        获取指定集合中的条目总数。
+
+        Args:
+            collection_name (str): 目标集合的名称。
+
+        Returns:
+            int: 条目总数。
+        """
+        pass
+        
+    @abstractmethod
+    def delete_collection(self, name: str) -> None:
+        """
+        删除一个集合。
+
+        Args:
+            name (str): 要删除的集合的名称。
+        """
+        pass
--- a/src/common/vector_db/chromadb_impl.py
+++ b/src/common/vector_db/chromadb_impl.py
@@ -0,0 +1,137 @@
+import threading
+from typing import Any, Dict, List, Optional
+
+import chromadb
+from chromadb.config import Settings
+
+from .base import VectorDBBase
+from src.common.logger import get_logger
+
+logger = get_logger("chromadb_impl")
+
+class ChromaDBImpl(VectorDBBase):
+    """
+    ChromaDB 的具体实现，遵循 VectorDBBase 接口。
+    采用单例模式，确保全局只有一个 ChromaDB 客户端实例。
+    """
+    _instance = None
+    _lock = threading.Lock()
+
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            with cls._lock:
+                if not cls._instance:
+                    cls._instance = super(ChromaDBImpl, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self, path: str = "data/chroma_db", **kwargs: Any):
+        """
+        初始化 ChromaDB 客户端。
+        由于是单例，这个初始化只会执行一次。
+        """
+        if not hasattr(self, '_initialized'):
+            with self._lock:
+                if not hasattr(self, '_initialized'):
+                    try:
+                        self.client = chromadb.PersistentClient(
+                            path=path,
+                            settings=Settings(anonymized_telemetry=False)
+                        )
+                        self._collections: Dict[str, Any] = {}
+                        self._initialized = True
+                        logger.info(f"ChromaDB 客户端已初始化，数据库路径: {path}")
+                    except Exception as e:
+                        logger.error(f"ChromaDB 初始化失败: {e}")
+                        self.client = None
+                        self._initialized = False
+
+    def get_or_create_collection(self, name: str, **kwargs: Any) -> Any:
+        if not self.client:
+            raise ConnectionError("ChromaDB 客户端未初始化")
+            
+        if name in self._collections:
+            return self._collections[name]
+        
+        try:
+            collection = self.client.get_or_create_collection(name=name, **kwargs)
+            self._collections[name] = collection
+            logger.info(f"成功获取或创建集合: '{name}'")
+            return collection
+        except Exception as e:
+            logger.error(f"获取或创建集合 '{name}' 失败: {e}")
+            return None
+
+    def add(
+        self,
+        collection_name: str,
+        embeddings: List[List[float]],
+        documents: Optional[List[str]] = None,
+        metadatas: Optional[List[Dict[str, Any]]] = None,
+        ids: Optional[List[str]] = None,
+    ) -> None:
+        collection = self.get_or_create_collection(collection_name)
+        if collection:
+            try:
+                collection.add(
+                    embeddings=embeddings,
+                    documents=documents,
+                    metadatas=metadatas,
+                    ids=ids,
+                )
+            except Exception as e:
+                logger.error(f"向集合 '{collection_name}' 添加数据失败: {e}")
+
+    def query(
+        self,
+        collection_name: str,
+        query_embeddings: List[List[float]],
+        n_results: int = 1,
+        where: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> Dict[str, List[Any]]:
+        collection = self.get_or_create_collection(collection_name)
+        if collection:
+            try:
+                return collection.query(
+                    query_embeddings=query_embeddings,
+                    n_results=n_results,
+                    where=where or {},
+                    **kwargs,
+                )
+            except Exception as e:
+                logger.error(f"查询集合 '{collection_name}' 失败: {e}")
+        return {}
+
+    def delete(
+        self,
+        collection_name: str,
+        ids: Optional[List[str]] = None,
+        where: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        collection = self.get_or_create_collection(collection_name)
+        if collection:
+            try:
+                collection.delete(ids=ids, where=where)
+            except Exception as e:
+                logger.error(f"从集合 '{collection_name}' 删除数据失败: {e}")
+
+    def count(self, collection_name: str) -> int:
+        collection = self.get_or_create_collection(collection_name)
+        if collection:
+            try:
+                return collection.count()
+            except Exception as e:
+                logger.error(f"获取集合 '{collection_name}' 计数失败: {e}")
+        return 0
+        
+    def delete_collection(self, name: str) -> None:
+        if not self.client:
+            raise ConnectionError("ChromaDB 客户端未初始化")
+        
+        try:
+            self.client.delete_collection(name=name)
+            if name in self._collections:
+                del self._collections[name]
+            logger.info(f"集合 '{name}' 已被删除")
+        except Exception as e:
+            logger.error(f"删除集合 '{name}' 失败: {e}")