# Connection settings for each supported vector-store backend, keyed by the
# vector-store type name.
kbs_config = {
    # No connection settings are defined for FAISS.
    "faiss": {
    },
    "milvus": {
        "host": "127.0.0.1",
        "port": "19530",
        "user": "",
        "password": "",
        "secure": False,
    },
    "pg": {
        # Default points at a pgvector instance on the local host, matching
        # docs/docker/vector_db/pg/docker-compose.yml (port 5432, database
        # "langchain_chatgml", user/password "postgres"). A machine-specific
        # LAN address must not be the shipped default; override this value
        # for remote deployments.
        "connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatgml",
    },
}
"http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.1.3 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +networks: + default: + name: milvus diff --git a/docs/docker/vector_db/pg/docker-compose.yml b/docs/docker/vector_db/pg/docker-compose.yml new file mode 100644 index 0000000..8e8359c --- /dev/null +++ b/docs/docker/vector_db/pg/docker-compose.yml @@ -0,0 +1,13 @@ +version: "3.8" +services: + postgresql: + image: ankane/pgvector:v0.4.1 + container_name: langchain-chatgml-pg-db + environment: + POSTGRES_DB: langchain_chatgml + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + ports: + - 5432:5432 + volumes: + - ./data:/var/lib/postgresql/data \ No newline at end of file diff --git a/docs/向量库环境docker.md b/docs/向量库环境docker.md new file mode 100644 index 0000000..b06bdb9 --- /dev/null +++ b/docs/向量库环境docker.md @@ -0,0 +1,7 @@ +向量库环境docker-compose.yml文件在docs/docker/vector_db中 + +以milvus为例 +```shell +cd docs/docker/vector_db/milvus +docker-compose up -d +``` diff --git a/requirements.txt b/requirements.txt index 44e4230..c187f88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,5 @@ faiss-cpu pymilvus==2.1.3 # requires milvus==2.1.3 SQLAlchemy==2.0.19 +# psycopg2 +# pgvector \ No newline at end of file diff --git a/server/db/models/knowledge_base_model.py b/server/db/models/knowledge_base_model.py index b59d9a6..37abd4e 100644 --- a/server/db/models/knowledge_base_model.py +++ b/server/db/models/knowledge_base_model.py @@ -1,6 +1,6 @@ -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, Integer, String, DateTime, func + from server.db.base import Base -from datetime import 
datetime class KnowledgeBaseModel(Base): @@ -12,8 +12,8 @@ class KnowledgeBaseModel(Base): kb_name = Column(String, comment='知识库名称') vs_type = Column(String, comment='嵌入模型类型') embed_model = Column(String, comment='嵌入模型名称') - file_count = Column(Integer, comment='文件数量', default=0) - create_time = Column(DateTime, comment='创建时间', default=datetime.now) + file_count = Column(Integer, default=0, comment='文件数量') + create_time = Column(DateTime, default=func.now(), comment='创建时间') def __repr__(self): return f"" diff --git a/server/db/models/knowledge_file_model.py b/server/db/models/knowledge_file_model.py index 43aba11..7fffdfb 100644 --- a/server/db/models/knowledge_file_model.py +++ b/server/db/models/knowledge_file_model.py @@ -1,6 +1,7 @@ -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, Integer, String, DateTime, func + from server.db.base import Base -from datetime import datetime + class KnowledgeFileModel(Base): """ @@ -13,8 +14,8 @@ class KnowledgeFileModel(Base): kb_name = Column(String, comment='所属知识库名称') document_loader_name = Column(String, comment='文档加载器名称') text_splitter_name = Column(String, comment='文本分割器名称') - file_version = Column(Integer, comment='文件版本', default=1) - create_time = Column(DateTime, comment='创建时间', default=datetime.now) + file_version = Column(Integer, default=1, comment='文件版本') + create_time = Column(DateTime, default=func.now(), comment='创建时间') def __repr__(self): return f"" diff --git a/server/db/repository/knowledge_file_repository.py b/server/db/repository/knowledge_file_repository.py index 9aa00b7..37197ce 100644 --- a/server/db/repository/knowledge_file_repository.py +++ b/server/db/repository/knowledge_file_repository.py @@ -1,3 +1,5 @@ +import datetime + from server.db.models.knowledge_base_model import KnowledgeBaseModel from server.db.models.knowledge_file_model import KnowledgeFileModel from server.db.session import with_session @@ -20,19 +22,13 @@ def add_doc_to_db(session, kb_file: 
KnowledgeFile): kb_name=kb_file.kb_name).first() if existing_file: existing_file.file_version += 1 - session.add(existing_file) # 否则,添加新文件 else: - new_file = KnowledgeFileModel( - file_name=kb_file.filename, - file_ext=kb_file.ext, - kb_name=kb_file.kb_name, - document_loader_name=kb_file.document_loader_name, - text_splitter_name=kb_file.text_splitter_name, - ) - kb.file_count += 1 - session.add(new_file) - session.add(kb) + session.add(KnowledgeFileModel(file_name=kb_file.filename, + file_ext=kb_file.ext, + document_loader_name=kb_file.document_loader_name, + text_splitter_name=kb_file.text_splitter_name + )) return True diff --git a/server/knowledge_base/kb_service/base.py b/server/knowledge_base/kb_service/base.py index 7209dca..48ca142 100644 --- a/server/knowledge_base/kb_service/base.py +++ b/server/knowledge_base/kb_service/base.py @@ -5,10 +5,11 @@ import os from langchain.embeddings.base import Embeddings from langchain.docstore.document import Document +from configs.config import kbs_config from server.db.repository.knowledge_base_repository import add_kb_to_db, delete_kb_from_db, list_kbs_from_db, kb_exists, load_kb_from_db from server.db.repository.knowledge_file_repository import add_doc_to_db, delete_file_from_db, doc_exists, \ list_docs_from_db -from configs.model_config import (DB_ROOT_PATH, kbs_config, VECTOR_SEARCH_TOP_K, +from configs.model_config import (DB_ROOT_PATH, VECTOR_SEARCH_TOP_K, embedding_model_dict, EMBEDDING_DEVICE, EMBEDDING_MODEL) from server.knowledge_base.utils import (get_kb_path, get_doc_path, load_embeddings, KnowledgeFile) from typing import List, Union @@ -18,6 +19,7 @@ class SupportedVSType: FAISS = 'faiss' MILVUS = 'milvus' DEFAULT = 'default' + PG = 'pg' class KBService(ABC): @@ -189,6 +191,9 @@ class KBServiceFactory: if SupportedVSType.FAISS == vector_store_type: from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService return FaissKBService(kb_name, embed_model=embed_model) + if SupportedVSType.PG 
class PGKBService(KBService):
    """Knowledge-base service that stores vectors in PostgreSQL through
    langchain's ``PGVector`` store.

    The collection name is the knowledge-base name and the connection string
    is read from ``kbs_config["pg"]["connection_uri"]``.
    """

    pg_vector: PGVector

    def _load_pg_vector(self, embedding_device: str = EMBEDDING_DEVICE, embeddings: Embeddings = None):
        """(Re)create ``self.pg_vector`` for this knowledge base.

        :param embedding_device: device passed to ``load_embeddings`` when no
            ``embeddings`` object is supplied.
        :param embeddings: embedding function to use; when ``None`` it is
            loaded from ``self.embed_model``.
        """
        _embeddings = embeddings
        if _embeddings is None:
            _embeddings = load_embeddings(self.embed_model, embedding_device)
        self.pg_vector = PGVector(embedding_function=_embeddings,
                                  collection_name=self.kb_name,
                                  connection_string=kbs_config.get("pg").get("connection_uri"))

    def do_init(self):
        """Build the ``PGVector`` store with the default embeddings."""
        self._load_pg_vector()

    def do_create_kb(self):
        """Nothing to do here; this hook is intentionally a no-op."""
        pass

    def vs_type(self) -> str:
        """Return the vector-store type identifier for this service."""
        return SupportedVSType.PG

    def do_drop_kb(self):
        """Delete this knowledge base's embeddings and its collection row.

        The knowledge-base name is passed as a bound parameter rather than
        interpolated into the SQL text, so names containing quotes or SQL
        fragments cannot alter the statements.
        """
        params = {"kb_name": self.kb_name}
        with self.pg_vector.connect() as connect:
            # Remove the embeddings that belong to this collection first ...
            connect.execute(
                text(
                    "DELETE FROM langchain_pg_embedding "
                    "WHERE collection_id IN ("
                    "SELECT uuid FROM langchain_pg_collection WHERE name = :kb_name"
                    ")"
                ),
                params,
            )
            # ... then the collection record itself. Both statements are
            # committed together below.
            connect.execute(
                text("DELETE FROM langchain_pg_collection WHERE name = :kb_name"),
                params,
            )
            connect.commit()

    def do_search(self, query: str, top_k: int, embeddings: Embeddings) -> List[Document]:
        """Return the ``top_k`` documents most similar to ``query``.

        The store is rebuilt with the supplied ``embeddings`` before searching.
        """
        self._load_pg_vector(embeddings=embeddings)
        return self.pg_vector.similarity_search(query, top_k)

    def add_doc(self, kb_file: KnowledgeFile):
        """Add a file to the knowledge base.

        The file is converted to documents, written to the vector store, and
        then recorded through ``add_doc_to_db``.

        :return: the status returned by ``add_doc_to_db``.
        """
        docs = kb_file.file2text()
        self.pg_vector.add_documents(docs)
        from server.db.repository.knowledge_file_repository import add_doc_to_db
        # NOTE(review): called without a session argument; presumably the
        # repository function gets its session injected - confirm.
        status = add_doc_to_db(kb_file)
        return status

    def do_add_doc(self, docs: List[Document], embeddings: Embeddings):
        """No-op: ``add_doc`` above writes to the vector store directly."""
        pass

    def do_delete_doc(self, kb_file: KnowledgeFile):
        """Delete every embedding whose metadata ``source`` equals the file path.

        The JSON filter is produced by ``json.dumps`` (which escapes
        backslashes and quotes) and sent as a bound parameter, instead of
        being spliced into the SQL string.
        """
        import json

        # ensure_ascii=False keeps non-ASCII path characters as-is in the filter.
        metadata_filter = json.dumps({"source": kb_file.filepath}, ensure_ascii=False)
        with self.pg_vector.connect() as connect:
            # CAST(...) is used instead of "::jsonb" next to the bind name,
            # because ":name::type" is not recognised as a bound parameter.
            connect.execute(
                text(
                    "DELETE FROM langchain_pg_embedding "
                    "WHERE CAST(cmetadata AS jsonb) @> CAST(:metadata_filter AS jsonb)"
                ),
                {"metadata_filter": metadata_filter},
            )
            connect.commit()

    def do_clear_vs(self):
        """Delete the whole collection through the ``PGVector`` store."""
        self.pg_vector.delete_collection()


if __name__ == '__main__':
    # Manual smoke test against the configured PostgreSQL instance.
    from server.db.base import Base, engine
    Base.metadata.create_all(bind=engine)
    pGKBService = PGKBService("test")
    pGKBService.create_kb()
    pGKBService.add_doc(KnowledgeFile("test.pdf", "test"))
    pGKBService.delete_doc(KnowledgeFile("test.pdf", "test"))
    pGKBService.drop_kb()
    print(pGKBService.search_docs("测试"))