Merge branch 'pr1037_pg_vs' into dev_fastchat

This commit is contained in:
liunux4odoo 2023-08-10 15:07:58 +08:00
commit 0364215987
10 changed files with 171 additions and 10 deletions

View File

@ -158,6 +158,9 @@ kbs_config = {
"user": "",
"password": "",
"secure": False,
},
"pg": {
"connection_uri": "postgresql://postgres:postgres@192.168.50.128:5432/langchain_chatgml",
}
}

View File

@ -0,0 +1,49 @@
# Docker Compose stack for a standalone Milvus vector database.
# Three services are defined: etcd, MinIO, and the Milvus server itself.
# Every volume is rooted at $DOCKER_VOLUME_DIRECTORY, falling back to the
# current directory when that variable is unset.
version: '3.5'
services:
# etcd service; the Milvus container points at it through ETCD_ENDPOINTS.
etcd:
container_name: milvus-etcd
image: quay.io/coreos/etcd:v3.5.0
environment:
- ETCD_AUTO_COMPACTION_MODE=revision
- ETCD_AUTO_COMPACTION_RETENTION=1000
- ETCD_QUOTA_BACKEND_BYTES=4294967296
- ETCD_SNAPSHOT_COUNT=50000
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
# MinIO service; the Milvus container points at it through MINIO_ADDRESS.
minio:
container_name: milvus-minio
image: minio/minio:RELEASE.2022-03-17T06-34-49Z
environment:
# NOTE(review): these look like stock MinIO credentials - confirm they are acceptable for the target environment.
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
command: minio server /minio_data
# Probe the MinIO liveness endpoint every 30s (20s timeout, 3 retries).
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
# Milvus server in standalone mode; started only after etcd and minio.
standalone:
container_name: milvus-standalone
# Image tag v2.1.3 corresponds to the pymilvus==2.1.3 pin in the requirements file.
image: milvusdb/milvus:v2.1.3
command: ["milvus", "run", "standalone"]
environment:
ETCD_ENDPOINTS: etcd:2379
MINIO_ADDRESS: minio:9000
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
# Both container ports are published on the host under the same numbers.
ports:
- "19530:19530"
- "9091:9091"
depends_on:
- "etcd"
- "minio"
# The default network of this stack is named "milvus".
networks:
default:
name: milvus

View File

@ -0,0 +1,13 @@
# Docker Compose service running PostgreSQL from the ankane/pgvector image.
# The database name, user and password below match the credentials embedded
# in the "pg" connection_uri of kbs_config.
version: "3.8"
services:
postgresql:
image: ankane/pgvector:v0.4.1
container_name: langchain-chatgml-pg-db
environment:
POSTGRES_DB: langchain_chatgml
POSTGRES_USER: postgres
# NOTE(review): presumably a development-only password - confirm before exposing this port.
POSTGRES_PASSWORD: postgres
# Publish the PostgreSQL port on the host.
ports:
- 5432:5432
# Keep the database files in ./data next to this compose file.
volumes:
- ./data:/var/lib/postgresql/data

View File

@ -0,0 +1,7 @@
向量库环境docker-compose.yml文件在docs/docker/vector_db中
以milvus为例
```shell
cd docs/docker/vector_db/milvus
docker-compose up -d
```

View File

@ -28,3 +28,5 @@ faiss-cpu
pymilvus==2.1.3 # requires milvus==2.1.3
SQLAlchemy==2.0.19
# psycopg2
# pgvector

View File

@ -1,6 +1,6 @@
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy import Column, Integer, String, DateTime, func
from server.db.base import Base
from datetime import datetime
class KnowledgeBaseModel(Base):
@ -12,8 +12,8 @@ class KnowledgeBaseModel(Base):
kb_name = Column(String, comment='知识库名称')
vs_type = Column(String, comment='嵌入模型类型')
embed_model = Column(String, comment='嵌入模型名称')
file_count = Column(Integer, comment='文件数量', default=0)
create_time = Column(DateTime, comment='创建时间', default=datetime.now)
file_count = Column(Integer, default=0, comment='文件数量')
create_time = Column(DateTime, default=func.now(), comment='创建时间')
def __repr__(self):
return f"<KnowledgeBase(id='{self.id}', kb_name='{self.kb_name}', vs_type='{self.vs_type}', embed_model='{self.embed_model}', file_count='{self.file_count}', create_time='{self.create_time}')>"

View File

@ -1,6 +1,7 @@
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy import Column, Integer, String, DateTime, func
from server.db.base import Base
from datetime import datetime
class KnowledgeFileModel(Base):
"""
@ -13,8 +14,8 @@ class KnowledgeFileModel(Base):
kb_name = Column(String, comment='所属知识库名称')
document_loader_name = Column(String, comment='文档加载器名称')
text_splitter_name = Column(String, comment='文本分割器名称')
file_version = Column(Integer, comment='文件版本', default=1)
create_time = Column(DateTime, comment='创建时间', default=datetime.now)
file_version = Column(Integer, default=1, comment='文件版本')
create_time = Column(DateTime, default=func.now(), comment='创建时间')
def __repr__(self):
return f"<KnowledgeFile(id='{self.id}', file_name='{self.file_name}', file_ext='{self.file_ext}', kb_name='{self.kb_name}', document_loader_name='{self.document_loader_name}', text_splitter_name='{self.text_splitter_name}', file_version='{self.file_version}', create_time='{self.create_time}')>"

View File

@ -20,7 +20,6 @@ def add_doc_to_db(session, kb_file: KnowledgeFile):
kb_name=kb_file.kb_name).first()
if existing_file:
existing_file.file_version += 1
session.add(existing_file)
# 否则,添加新文件
else:
new_file = KnowledgeFileModel(
@ -32,7 +31,6 @@ def add_doc_to_db(session, kb_file: KnowledgeFile):
)
kb.file_count += 1
session.add(new_file)
session.add(kb)
return True

View File

@ -18,6 +18,7 @@ class SupportedVSType:
FAISS = 'faiss'
MILVUS = 'milvus'
DEFAULT = 'default'
PG = 'pg'
class KBService(ABC):
@ -189,6 +190,9 @@ class KBServiceFactory:
if SupportedVSType.FAISS == vector_store_type:
from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
return FaissKBService(kb_name, embed_model=embed_model)
if SupportedVSType.PG == vector_store_type:
from server.knowledge_base.kb_service.pg_kb_service import PGKBService
return PGKBService(kb_name, embed_model=embed_model)
elif SupportedVSType.MILVUS == vector_store_type:
from server.knowledge_base.kb_service.milvus_kb_service import MilvusKBService
return MilvusKBService(kb_name, embed_model=embed_model) # other milvus parameters are set in model_config.kbs_config

View File

@ -0,0 +1,84 @@
from typing import List
from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain.vectorstores import PGVector
from sqlalchemy import text
from configs.model_config import EMBEDDING_DEVICE, kbs_config
from server.knowledge_base.kb_service.base import SupportedVSType
from server.knowledge_base.utils import KBService, load_embeddings, KnowledgeFile
class PGKBService(KBService):
    """Knowledge-base service that stores vectors in PostgreSQL via LangChain's ``PGVector``.

    The collection name is the knowledge-base name (``self.kb_name``) and the
    connection string is read from ``kbs_config["pg"]["connection_uri"]``.
    """

    # Vector-store handle; assigned by ``_load_pg_vector``.
    pg_vector: PGVector

    def _load_pg_vector(self, embedding_device: str = EMBEDDING_DEVICE, embeddings: Embeddings = None):
        """Create ``self.pg_vector`` for this knowledge base.

        Args:
            embedding_device: Device handed to ``load_embeddings`` when no
                ``embeddings`` object is supplied.
            embeddings: Embedding function to use. When ``None`` it is loaded
                from ``self.embed_model``.
        """
        _embeddings = embeddings
        if _embeddings is None:
            _embeddings = load_embeddings(self.embed_model, embedding_device)
        self.pg_vector = PGVector(embedding_function=_embeddings,
                                  collection_name=self.kb_name,
                                  connection_string=kbs_config.get("pg").get("connection_uri"))

    def do_init(self):
        """Initialise the vector store with the default embeddings."""
        self._load_pg_vector()

    def do_create_kb(self):
        """No-op: nothing is created explicitly here."""
        pass

    def vs_type(self) -> str:
        """Return the vector-store type identifier for this service."""
        return SupportedVSType.PG

    def do_drop_kb(self):
        """Delete this knowledge base's embeddings and its collection record.

        The knowledge-base name is passed as a bound parameter rather than
        being interpolated into the SQL text, so names containing quotes
        neither break the statement nor allow SQL injection.
        """
        params = {"kb_name": self.kb_name}
        with self.pg_vector.connect() as connect:
            # Delete the langchain_pg_embedding rows linked to this
            # collection's langchain_pg_collection record.
            connect.execute(
                text(
                    "DELETE FROM langchain_pg_embedding "
                    "WHERE collection_id IN ("
                    "SELECT uuid FROM langchain_pg_collection WHERE name = :kb_name"
                    ")"
                ),
                params,
            )
            # Delete the langchain_pg_collection record itself.
            connect.execute(
                text("DELETE FROM langchain_pg_collection WHERE name = :kb_name"),
                params,
            )
            connect.commit()

    def do_search(self, query: str, top_k: int, embeddings: Embeddings) -> List[Document]:
        """Return the ``top_k`` documents most similar to ``query``.

        The vector store is rebuilt with the supplied ``embeddings`` before
        the search is run.
        """
        self._load_pg_vector(embeddings=embeddings)
        return self.pg_vector.similarity_search(query, top_k)

    def add_doc(self, kb_file: KnowledgeFile):
        """Add a file to the knowledge base.

        The file is converted to documents, stored in the vector store, and
        then registered through ``add_doc_to_db``.

        Returns:
            The status returned by ``add_doc_to_db``.
        """
        docs = kb_file.file2text()
        self.pg_vector.add_documents(docs)
        from server.db.repository.knowledge_file_repository import add_doc_to_db
        # NOTE(review): add_doc_to_db is declared as (session, kb_file);
        # presumably a decorator injects the session - confirm.
        status = add_doc_to_db(kb_file)
        return status

    def do_add_doc(self, docs: List[Document], embeddings: Embeddings):
        """No-op: this class defines its own ``add_doc`` instead."""
        pass

    def do_delete_doc(self, kb_file: KnowledgeFile):
        """Delete every embedding whose metadata ``source`` equals the file path.

        The JSON filter is built with ``json.dumps`` and sent as a bound
        parameter, so backslashes, quotes and other special characters in the
        path are escaped correctly instead of being spliced into the SQL.
        """
        import json

        source_filter = json.dumps({"source": kb_file.filepath}, ensure_ascii=False)
        with self.pg_vector.connect() as connect:
            # CAST(... AS jsonb) is used instead of "::" so the bind-parameter
            # marker is never adjacent to a cast operator.
            # NOTE(review): as before, the delete is not restricted to this
            # knowledge base's collection - confirm that is intended.
            connect.execute(
                text(
                    "DELETE FROM langchain_pg_embedding "
                    "WHERE CAST(cmetadata AS jsonb) @> CAST(:source_filter AS jsonb)"
                ),
                {"source_filter": source_filter},
            )
            connect.commit()

    def do_clear_vs(self):
        """Remove the whole collection from the vector store."""
        self.pg_vector.delete_collection()
if __name__ == '__main__':
    # Manual smoke test: create the tables, then exercise the service
    # end to end against a knowledge base named "test".
    from server.db.base import Base, engine

    Base.metadata.create_all(bind=engine)

    pg_service = PGKBService("test")
    pg_service.create_kb()

    # Add the sample file and then delete it, building a fresh KnowledgeFile
    # for each call.
    for operation in (pg_service.add_doc, pg_service.delete_doc):
        operation(KnowledgeFile("test.pdf", "test"))

    pg_service.drop_kb()

    hits = pg_service.search_docs("测试")
    print(hits)