添加向量数据库pg支持,和向量库docker-compose.yml环境文件
This commit is contained in:
parent
f7b2c8cd04
commit
d4f728dfa2
|
|
@ -0,0 +1,14 @@
|
||||||
|
import os

# Per-backend connection settings for the supported vector stores.
# Keys match the vector-store type names ("faiss", "milvus", "pg").
kbs_config = {
    # FAISS needs no connection settings.
    "faiss": {
    },
    "milvus": {
        "host": "127.0.0.1",
        "port": "19530",
        "user": "",
        "password": "",
        "secure": False,
    },
    "pg": {
        # The URI embeds credentials and a machine-specific host, so allow it to
        # be overridden from the environment instead of editing this file.
        # The fallback is the previous hard-coded value, kept for compatibility.
        "connection_uri": os.environ.get(
            "PG_CONNECTION_URI",
            "postgresql://postgres:postgres@192.168.50.128:5432/langchain_chatgml",
        ),
    },
}
|
||||||
|
|
@ -149,17 +149,7 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
|
||||||
# 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG
|
# 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG
|
||||||
BING_SUBSCRIPTION_KEY = ""
|
BING_SUBSCRIPTION_KEY = ""
|
||||||
|
|
||||||
kbs_config = {
|
|
||||||
"faiss": {
|
|
||||||
},
|
|
||||||
"milvus": {
|
|
||||||
"host": "127.0.0.1",
|
|
||||||
"port": "19530",
|
|
||||||
"user": "",
|
|
||||||
"password": "",
|
|
||||||
"secure": False,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# 是否开启中文标题加强,以及标题增强的相关配置
|
# 是否开启中文标题加强,以及标题增强的相关配置
|
||||||
# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
|
# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,49 @@
|
||||||
|
version: '3.5'
|
||||||
|
|
||||||
|
services:
|
||||||
|
etcd:
|
||||||
|
container_name: milvus-etcd
|
||||||
|
image: quay.io/coreos/etcd:v3.5.0
|
||||||
|
environment:
|
||||||
|
- ETCD_AUTO_COMPACTION_MODE=revision
|
||||||
|
- ETCD_AUTO_COMPACTION_RETENTION=1000
|
||||||
|
- ETCD_QUOTA_BACKEND_BYTES=4294967296
|
||||||
|
- ETCD_SNAPSHOT_COUNT=50000
|
||||||
|
volumes:
|
||||||
|
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
|
||||||
|
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
|
||||||
|
|
||||||
|
minio:
|
||||||
|
container_name: milvus-minio
|
||||||
|
image: minio/minio:RELEASE.2022-03-17T06-34-49Z
|
||||||
|
environment:
|
||||||
|
MINIO_ACCESS_KEY: minioadmin
|
||||||
|
MINIO_SECRET_KEY: minioadmin
|
||||||
|
volumes:
|
||||||
|
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
|
||||||
|
command: minio server /minio_data
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 20s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
standalone:
|
||||||
|
container_name: milvus-standalone
|
||||||
|
image: milvusdb/milvus:v2.1.3
|
||||||
|
command: ["milvus", "run", "standalone"]
|
||||||
|
environment:
|
||||||
|
ETCD_ENDPOINTS: etcd:2379
|
||||||
|
MINIO_ADDRESS: minio:9000
|
||||||
|
volumes:
|
||||||
|
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
|
||||||
|
ports:
|
||||||
|
- "19530:19530"
|
||||||
|
- "9091:9091"
|
||||||
|
depends_on:
|
||||||
|
- "etcd"
|
||||||
|
- "minio"
|
||||||
|
|
||||||
|
networks:
|
||||||
|
default:
|
||||||
|
name: milvus
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
version: "3.8"
|
||||||
|
services:
|
||||||
|
postgresql:
|
||||||
|
image: ankane/pgvector:v0.4.1
|
||||||
|
container_name: langchain-chatgml-pg-db
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: langchain_chatgml
|
||||||
|
POSTGRES_USER: postgres
|
||||||
|
POSTGRES_PASSWORD: postgres
|
||||||
|
ports:
|
||||||
|
- 5432:5432
|
||||||
|
volumes:
|
||||||
|
- ./data:/var/lib/postgresql/data
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
向量库环境docker-compose.yml文件在docs/docker/vector_db中
|
||||||
|
|
||||||
|
以milvus为例
|
||||||
|
```shell
|
||||||
|
cd docs/docker/vector_db/milvus
|
||||||
|
docker-compose up -d
|
||||||
|
```
|
||||||
|
|
@ -28,3 +28,5 @@ faiss-cpu
|
||||||
pymilvus==2.1.3 # requires milvus==2.1.3
|
pymilvus==2.1.3 # requires milvus==2.1.3
|
||||||
|
|
||||||
SQLAlchemy==2.0.19
|
SQLAlchemy==2.0.19
|
||||||
|
# psycopg2
|
||||||
|
# pgvector
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from sqlalchemy import Column, Integer, String, DateTime
|
from sqlalchemy import Column, Integer, String, DateTime, func
|
||||||
|
|
||||||
from server.db.base import Base
|
from server.db.base import Base
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class KnowledgeBaseModel(Base):
|
class KnowledgeBaseModel(Base):
|
||||||
|
|
@ -12,8 +12,8 @@ class KnowledgeBaseModel(Base):
|
||||||
kb_name = Column(String, comment='知识库名称')
|
kb_name = Column(String, comment='知识库名称')
|
||||||
vs_type = Column(String, comment='嵌入模型类型')
|
vs_type = Column(String, comment='嵌入模型类型')
|
||||||
embed_model = Column(String, comment='嵌入模型名称')
|
embed_model = Column(String, comment='嵌入模型名称')
|
||||||
file_count = Column(Integer, comment='文件数量', default=0)
|
file_count = Column(Integer, default=0, comment='文件数量')
|
||||||
create_time = Column(DateTime, comment='创建时间', default=datetime.now)
|
create_time = Column(DateTime, default=func.now(), comment='创建时间')
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"<KnowledgeBase(id='{self.id}', kb_name='{self.kb_name}', vs_type='{self.vs_type}', embed_model='{self.embed_model}', file_count='{self.file_count}', create_time='{self.create_time}')>"
|
return f"<KnowledgeBase(id='{self.id}', kb_name='{self.kb_name}', vs_type='{self.vs_type}', embed_model='{self.embed_model}', file_count='{self.file_count}', create_time='{self.create_time}')>"
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
from sqlalchemy import Column, Integer, String, DateTime
|
from sqlalchemy import Column, Integer, String, DateTime, func
|
||||||
|
|
||||||
from server.db.base import Base
|
from server.db.base import Base
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
class KnowledgeFileModel(Base):
|
class KnowledgeFileModel(Base):
|
||||||
"""
|
"""
|
||||||
|
|
@ -13,8 +14,8 @@ class KnowledgeFileModel(Base):
|
||||||
kb_name = Column(String, comment='所属知识库名称')
|
kb_name = Column(String, comment='所属知识库名称')
|
||||||
document_loader_name = Column(String, comment='文档加载器名称')
|
document_loader_name = Column(String, comment='文档加载器名称')
|
||||||
text_splitter_name = Column(String, comment='文本分割器名称')
|
text_splitter_name = Column(String, comment='文本分割器名称')
|
||||||
file_version = Column(Integer, comment='文件版本', default=1)
|
file_version = Column(Integer, default=1, comment='文件版本')
|
||||||
create_time = Column(DateTime, comment='创建时间', default=datetime.now)
|
create_time = Column(DateTime, default=func.now(), comment='创建时间')
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"<KnowledgeFile(id='{self.id}', file_name='{self.file_name}', file_ext='{self.file_ext}', kb_name='{self.kb_name}', document_loader_name='{self.document_loader_name}', text_splitter_name='{self.text_splitter_name}', file_version='{self.file_version}', create_time='{self.create_time}')>"
|
return f"<KnowledgeFile(id='{self.id}', file_name='{self.file_name}', file_ext='{self.file_ext}', kb_name='{self.kb_name}', document_loader_name='{self.document_loader_name}', text_splitter_name='{self.text_splitter_name}', file_version='{self.file_version}', create_time='{self.create_time}')>"
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
import datetime
|
||||||
|
|
||||||
from server.db.models.knowledge_base_model import KnowledgeBaseModel
|
from server.db.models.knowledge_base_model import KnowledgeBaseModel
|
||||||
from server.db.models.knowledge_file_model import KnowledgeFileModel
|
from server.db.models.knowledge_file_model import KnowledgeFileModel
|
||||||
from server.db.session import with_session
|
from server.db.session import with_session
|
||||||
|
|
@ -20,19 +22,13 @@ def add_doc_to_db(session, kb_file: KnowledgeFile):
|
||||||
kb_name=kb_file.kb_name).first()
|
kb_name=kb_file.kb_name).first()
|
||||||
if existing_file:
|
if existing_file:
|
||||||
existing_file.file_version += 1
|
existing_file.file_version += 1
|
||||||
session.add(existing_file)
|
|
||||||
# 否则,添加新文件
|
# 否则,添加新文件
|
||||||
else:
|
else:
|
||||||
new_file = KnowledgeFileModel(
|
session.add(KnowledgeFileModel(file_name=kb_file.filename,
|
||||||
file_name=kb_file.filename,
|
file_ext=kb_file.ext,
|
||||||
file_ext=kb_file.ext,
|
document_loader_name=kb_file.document_loader_name,
|
||||||
kb_name=kb_file.kb_name,
|
text_splitter_name=kb_file.text_splitter_name
|
||||||
document_loader_name=kb_file.document_loader_name,
|
))
|
||||||
text_splitter_name=kb_file.text_splitter_name,
|
|
||||||
)
|
|
||||||
kb.file_count += 1
|
|
||||||
session.add(new_file)
|
|
||||||
session.add(kb)
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,10 +5,11 @@ import os
|
||||||
from langchain.embeddings.base import Embeddings
|
from langchain.embeddings.base import Embeddings
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
|
|
||||||
|
from configs.config import kbs_config
|
||||||
from server.db.repository.knowledge_base_repository import add_kb_to_db, delete_kb_from_db, list_kbs_from_db, kb_exists, load_kb_from_db
|
from server.db.repository.knowledge_base_repository import add_kb_to_db, delete_kb_from_db, list_kbs_from_db, kb_exists, load_kb_from_db
|
||||||
from server.db.repository.knowledge_file_repository import add_doc_to_db, delete_file_from_db, doc_exists, \
|
from server.db.repository.knowledge_file_repository import add_doc_to_db, delete_file_from_db, doc_exists, \
|
||||||
list_docs_from_db
|
list_docs_from_db
|
||||||
from configs.model_config import (DB_ROOT_PATH, kbs_config, VECTOR_SEARCH_TOP_K,
|
from configs.model_config import (DB_ROOT_PATH, VECTOR_SEARCH_TOP_K,
|
||||||
embedding_model_dict, EMBEDDING_DEVICE, EMBEDDING_MODEL)
|
embedding_model_dict, EMBEDDING_DEVICE, EMBEDDING_MODEL)
|
||||||
from server.knowledge_base.utils import (get_kb_path, get_doc_path, load_embeddings, KnowledgeFile)
|
from server.knowledge_base.utils import (get_kb_path, get_doc_path, load_embeddings, KnowledgeFile)
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
@ -18,6 +19,7 @@ class SupportedVSType:
|
||||||
FAISS = 'faiss'
|
FAISS = 'faiss'
|
||||||
MILVUS = 'milvus'
|
MILVUS = 'milvus'
|
||||||
DEFAULT = 'default'
|
DEFAULT = 'default'
|
||||||
|
PG = 'pg'
|
||||||
|
|
||||||
|
|
||||||
class KBService(ABC):
|
class KBService(ABC):
|
||||||
|
|
@ -189,6 +191,9 @@ class KBServiceFactory:
|
||||||
if SupportedVSType.FAISS == vector_store_type:
|
if SupportedVSType.FAISS == vector_store_type:
|
||||||
from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
|
from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
|
||||||
return FaissKBService(kb_name, embed_model=embed_model)
|
return FaissKBService(kb_name, embed_model=embed_model)
|
||||||
|
if SupportedVSType.PG == vector_store_type:
|
||||||
|
from server.knowledge_base.kb_service.pg_kb_service import PGKBService
|
||||||
|
return PGKBService(kb_name, embed_model=embed_model)
|
||||||
elif SupportedVSType.MILVUS == vector_store_type:
|
elif SupportedVSType.MILVUS == vector_store_type:
|
||||||
from server.knowledge_base.kb_service.milvus_kb_service import MilvusKBService
|
from server.knowledge_base.kb_service.milvus_kb_service import MilvusKBService
|
||||||
return MilvusKBService(kb_name, embed_model=embed_model) # other milvus parameters are set in model_config.kbs_config
|
return MilvusKBService(kb_name, embed_model=embed_model) # other milvus parameters are set in model_config.kbs_config
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from langchain.schema import Document
|
||||||
|
from langchain.vectorstores import PGVector
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
from configs.config import kbs_config
|
||||||
|
from configs.model_config import EMBEDDING_DEVICE
|
||||||
|
from server.knowledge_base.kb_service.base import KBService, load_embeddings, SupportedVSType
|
||||||
|
from server.knowledge_base.utils import KnowledgeFile
|
||||||
|
|
||||||
|
|
||||||
|
class PGKBService(KBService):
    """Knowledge-base service backed by PostgreSQL through LangChain's PGVector store.

    The collection name is the knowledge-base name, and the connection URI is
    read from ``kbs_config["pg"]["connection_uri"]``.
    """

    # Vector-store handle; (re)created by _load_pg_vector().
    pg_vector: PGVector

    def _load_pg_vector(self, embedding_device: str = EMBEDDING_DEVICE, embeddings: Embeddings = None):
        """Create the PGVector handle for this knowledge base and store it on ``self.pg_vector``.

        Args:
            embedding_device: Device passed to ``load_embeddings`` when no
                embeddings object is supplied.
            embeddings: Embedding function to use; when ``None`` it is loaded
                from ``self.embed_model``.
        """
        if embeddings is None:
            embeddings = load_embeddings(self.embed_model, embedding_device)
        self.pg_vector = PGVector(embedding_function=embeddings,
                                  collection_name=self.kb_name,
                                  connection_string=kbs_config.get("pg").get("connection_uri"))

    def do_init(self):
        """Initialise the service by building the PGVector handle with default embeddings."""
        self._load_pg_vector()

    def do_create_kb(self):
        """No-op: nothing is created here.

        NOTE(review): presumably PGVector sets up its collection itself when it
        is constructed -- confirm against the LangChain version in use.
        """
        pass

    def vs_type(self) -> str:
        """Return the vector-store type identifier for this service."""
        return SupportedVSType.PG

    def do_drop_kb(self):
        """Delete this knowledge base's embeddings and its collection row.

        The knowledge-base name is passed as a bound parameter rather than
        being formatted into the SQL text, so names containing quotes cannot
        break or inject into the statements.
        """
        params = {"kb_name": self.kb_name}
        with self.pg_vector.connect() as connect:
            # Remove the embeddings that belong to this collection first ...
            connect.execute(
                text(
                    "DELETE FROM langchain_pg_embedding "
                    "WHERE collection_id IN ("
                    "SELECT uuid FROM langchain_pg_collection WHERE name = :kb_name"
                    ")"
                ),
                params,
            )
            # ... then the collection record itself.
            connect.execute(
                text("DELETE FROM langchain_pg_collection WHERE name = :kb_name"),
                params,
            )
            connect.commit()

    def do_search(self, query: str, top_k: int, embeddings: Embeddings) -> List[Document]:
        """Return the ``top_k`` documents most similar to ``query``.

        The PGVector handle is rebuilt with the supplied ``embeddings`` before
        searching.
        """
        self._load_pg_vector(embeddings=embeddings)
        return self.pg_vector.similarity_search(query, top_k)

    def add_doc(self, kb_file: KnowledgeFile):
        """Add a file to the knowledge base.

        The file is converted to documents, stored in the vector store, and
        then recorded through ``add_doc_to_db``.

        Returns:
            Whatever ``add_doc_to_db`` returns for ``kb_file``.
        """
        docs = kb_file.file2text()
        self.pg_vector.add_documents(docs)
        # Imported locally, as in the original code.
        # NOTE(review): add_doc_to_db is called without a session argument;
        # presumably a decorator injects it -- confirm in the repository module.
        from server.db.repository.knowledge_file_repository import add_doc_to_db
        return add_doc_to_db(kb_file)

    def do_add_doc(self, docs: List[Document], embeddings: Embeddings):
        """No-op: documents are inserted directly by :meth:`add_doc`."""
        pass

    def do_delete_doc(self, kb_file: KnowledgeFile):
        """Delete every embedding whose metadata ``source`` equals ``kb_file.filepath``.

        The JSON filter is produced with ``json.dumps`` and sent as a bound
        parameter. This escapes backslashes (Windows paths) and quotes
        correctly, which the previous string substitution did not.
        """
        import json

        source_filter = json.dumps({"source": kb_file.filepath}, ensure_ascii=False)
        with self.pg_vector.connect() as connect:
            # CAST(... AS jsonb) is used instead of ":param::jsonb" because a
            # bind name directly followed by "::" is not recognised by text().
            connect.execute(
                text(
                    "DELETE FROM langchain_pg_embedding "
                    "WHERE cmetadata::jsonb @> CAST(:source_filter AS jsonb)"
                ),
                {"source_filter": source_filter},
            )
            connect.commit()

    def do_clear_vs(self):
        """Clear the vector store by deleting this knowledge base's collection."""
        self.pg_vector.delete_collection()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke run: create the tables, then walk one knowledge base
    # through create -> add -> delete -> drop and finally issue a search.
    from server.db.base import Base, engine

    Base.metadata.create_all(bind=engine)

    service = PGKBService("test")
    service.create_kb()

    file_to_add = KnowledgeFile("test.pdf", "test")
    service.add_doc(file_to_add)

    file_to_delete = KnowledgeFile("test.pdf", "test")
    service.delete_doc(file_to_delete)

    service.drop_kb()

    results = service.search_docs("测试")
    print(results)
|
||||||
Loading…
Reference in New Issue