From 6ad8aee88c1b2559d0f30ccc59b28e5cae6d6246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=94=90=E5=9B=BD=E6=A2=81?= Date: Thu, 14 Sep 2023 07:54:42 +0800 Subject: [PATCH 1/2] add ES function --- .DS_Store | Bin 0 -> 6148 bytes docs/.DS_Store | Bin 0 -> 6148 bytes docs/docker/.DS_Store | Bin 0 -> 6148 bytes docs/docker/vector_db/.DS_Store | Bin 0 -> 6148 bytes .../vector_db/elasticsearch/ES部署指南.md | 29 ++++ server/.DS_Store | Bin 0 -> 6148 bytes server/db/.DS_Store | Bin 0 -> 6148 bytes server/knowledge_base/.DS_Store | Bin 0 -> 6148 bytes server/knowledge_base/kb_service/base.py | 4 + .../kb_service/es_kb_service.py | 160 ++++++++++++++++++ 10 files changed, 193 insertions(+) create mode 100644 .DS_Store create mode 100644 docs/.DS_Store create mode 100644 docs/docker/.DS_Store create mode 100644 docs/docker/vector_db/.DS_Store create mode 100644 docs/docker/vector_db/elasticsearch/ES部署指南.md create mode 100644 server/.DS_Store create mode 100644 server/db/.DS_Store create mode 100644 server/knowledge_base/.DS_Store create mode 100644 server/knowledge_base/kb_service/es_kb_service.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a2264a1b2d57c55917f9f7455ae1078ee7c5b910 GIT binary patch literal 6148 zcmeHK!A`+1HX*{+Peklf`Awv;M4vU9!B20eiSCdYW1CM#BfVw2STX z-egics_^6bw8AGR^=gGTs*UNiXzv~zo?f({qUTt=C=wZ-Ia0PY&fpaX3pI`1K^%#= zhY`xMz{iTakoZYA^6=ZqnTG@d7{HjUI>i1s33C;#EOmh!WWvZ=FrfF3U0RE3O_7EH z!@zeKp!0#F5;__)h4Sb?BU=DO3*Az1OuYo4m`X)dsZ6&ROr@ir>vfLC zOrc5#rkf9@-^_G}!sOfW`CKjs<|s7QFkl#%XCSAhWxD^*SNH$(NoHmkFbw=F2AI`y zTTLuU->pl<(Ot_??@>u;U8Yc`pwZW{9MDy~k17RYoCb)F#!Ml4P|S~jq`?@&z>hNU E0gM8acmMzZ literal 0 HcmV?d00001 diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fe1d1249b38feefbe29bc03d3052345ae9b31a55 GIT binary patch literal 6148 zcmeHK%}T>S5Z<-5-BN@eR6H(tEtnReikDF93mDOZN=;1A(3ma#F^5vfQD4YM@p+ut z-5i3!qlle>-EVe&b~7Jje;8wYxD0!Ybr@p;8X`xfLeO04+OoljT+b1)S&+@AL85|+ ziTwWn&sDnc<20J-ggA^KfEXYK zHjx2)8i@8L7Ed)31H{0O4B-ACp&_~!bAx*8fDW(E=&vK9fR1koL}}5rm>UES2sf#K zCY9SK1~=(omp0C|m>V?djO&$Q9=me!c;R|=uuC1zxNDGlVt^PZGf*?b4xaxP@XJ&_ z^2;eSA_jkG#O8^XTA34xYS5T0$TO(;SSDjpZS7Ho@9#Y?F51&ruHr8cH$FlI}ewue&4QD4YM@p+ut z-H4?XJc-yDnEhtwC(C{bJ6Qk_?Qz%ur~-h4N?34lSRoWAU6O+J5DNVa1w-gU2thDi z%4Wx3WPskC3lr$WJ*4pE{f!1etOAU_1%o&mrS(zU(`IIiP%tGlyV zKE=<8dNE`Q{O+`DSv^nav67>$1gz_?jpCxGMR*bpSiuX{p ZpkI=K=vmARq6dY41T+miFay8Jzy}h^OxFMa literal 0 HcmV?d00001 diff --git a/docs/docker/vector_db/.DS_Store b/docs/docker/vector_db/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..4443dc27b92372480885001ef554b73ba7fd7047 GIT binary patch literal 6148 zcmeHKOKQVF43%0h45i>@mUD&PUA1gmxo8b! zqyki6s=#AxJL~_i@H_MWl*AnspaTC&0c|$h%^I(iy><3-)@uv=4Q@3*a5Jo(g5d2K j=?#m}Qbqne{rvJlyZs?5)!NeXinro5yqn-b7r`JL4D)W-zoOZ>)~TA< zp}I)oqSstM*4ZFTvbb*&k|@TI>&qmIbkWtrEXqx8U>Cf)SMN19N2AVOOCIcwTXJ-` z+iuDI_WpQW_cpe7j?Q|I=~Jd(EqwytG$X4HC-8=bsU*tpQiZ>a0*&J(=x3?U?r~1O zHF$&!G!&3RjHTb>*aHP2ws#1I)m07@++@q7wQJ zON(afz`?Wti1Zn01a0aiC`UT<9hMex1x46YM4KvHiy>?}`lXBW9hMetItbT%2zO`U zIuv20(LlY@I8P v&RU6ji%LRqrNyrlG|W+qv2+w~p&CKIBm>cRSXx963V#R~8hBs^{*-}Fg_l)4 literal 0 HcmV?d00001 diff --git a/server/db/.DS_Store b/server/db/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..506f7ffdfde3b5652a90dde7ca1bbecf33d8149c GIT binary patch literal 6148 zcmeHKF;2ul3>?D=osQ;8%KZXASe=d+@Bsv&Kne=FKtY#|Z{OHn5J!%Jh62Wtce1ng zTA3n_0odlHy9Cw%mUKtF`7$>@cc0lwV;n8+=rMNVU4J_a{Ve)8;kjpc#DEU>_>#ZE zj^q;$M?Bz}xWnrY3mh*iRtiV~DIf);z`s?1J=<-0o~Ts{NC7GEp@4rM3f-|boD$>H z!6ilj;(~M-pJSFF7AJ_c;gm>*W|>ONRBI!KWjguN;%dVwG1FmDnfJ8H)+Q8->f}p= z!)l^dDIf(76*$f9+WY?%{hRs!FiATpAO-%E0=C#*Z#R6U)>~&U=e@SjZ|LvF*a+tm lt(X|Cm>X}!7oXyaKl8jcoDw6Qe54cgBjCEoq`+?|um|^j9GUR0z@3@`)Cz-BODPd}%;8O!BWG6T%O4;Y~HL82167BhqT=)i`qkK`{9l3<_S z5`@yCYcVs39u#3x5lyPFPYhww(JpPAYcVru(n08%@jG^AVP7ah&yIGf!$G(Pxn%~J zfn^5rW>}&6fBF6Ue>sUq%m6d+uNV-8w%2aql5A~VSsc|`iTa32LUEbFMGAIwDaKeT d#fPX`&@QQi=vvGSq6LL70-6SHn1Nqq;0w1iO+o+w literal 0 HcmV?d00001 diff --git a/server/knowledge_base/kb_service/base.py b/server/knowledge_base/kb_service/base.py index ca0919e..87f7f9a 100644 --- a/server/knowledge_base/kb_service/base.py +++ b/server/knowledge_base/kb_service/base.py @@ -34,6 +34,7 @@ class SupportedVSType: MILVUS = 'milvus' DEFAULT = 'default' PG = 'pg' + ES = 'es' class KBService(ABC): @@ -239,6 +240,9 @@ class KBServiceFactory: from server.knowledge_base.kb_service.milvus_kb_service import MilvusKBService return MilvusKBService(kb_name, embed_model=embed_model) # other milvus parameters are set in model_config.kbs_config + elif SupportedVSType.ES == vector_store_type: + from server.knowledge_base.kb_service.es_kb_service import ESKBService + return ESKBService(kb_name, embed_model=embed_model) elif SupportedVSType.DEFAULT == vector_store_type: # kb_exists of default kbservice is False, to make validation easier. from server.knowledge_base.kb_service.default_kb_service import DefaultKBService return DefaultKBService(kb_name) diff --git a/server/knowledge_base/kb_service/es_kb_service.py b/server/knowledge_base/kb_service/es_kb_service.py new file mode 100644 index 0000000..1812f2b --- /dev/null +++ b/server/knowledge_base/kb_service/es_kb_service.py @@ -0,0 +1,160 @@ +#!/user/bin/env python3 +""" +File_Name: es_kb_service.py +Author: TangGuoLiang +Email: 896165277@qq.com +Created: 2023-09-05 +""" +from typing import List +import os +import shutil +from langchain.embeddings.base import Embeddings +from langchain.schema import Document +from langchain.vectorstores.elasticsearch import ElasticsearchStore +from configs.model_config import KB_ROOT_PATH, EMBEDDING_MODEL, EMBEDDING_DEVICE, CACHED_VS_NUM +from server.knowledge_base.kb_service.base import KBService, SupportedVSType +from server.knowledge_base.utils import load_embeddings +from elasticsearch import Elasticsearch +from configs.model_config import logger +from configs.model_config import kbs_config + +class ESKBService(KBService): + + def do_init(self): + self.kb_path = self.get_kb_path(self.kb_name) + self.index_name = self.kb_path.split("/")[-1] + self.IP = kbs_config[self.vs_type()]['host'] + self.PORT = kbs_config[self.vs_type()]['port'] + self.embeddings_model = load_embeddings(self.embed_model, EMBEDDING_DEVICE) + try: + # ES python客户端连接(仅连接) + self.es_client_python = Elasticsearch(f"{self.IP}:{self.PORT}") + except ConnectionError: + logger.error("连接到 Elasticsearch 失败!") + except Exception as e: + logger.error(f"Error 发生 : {e}") + + try: + # langchain ES 连接、创建索引 + self.db_init = ElasticsearchStore( + es_url=f"{self.IP}:{self.PORT}", + index_name=self.index_name, + query_field="context", + vector_query_field="vector", + embedding=self.embeddings_model, + ) + except ConnectionError: + logger.error("### 连接到 Elasticsearch 失败!") + except Exception as e: + logger.error(f"Error 发生 : {e}") + + @staticmethod + def get_kb_path(knowledge_base_name: str): + return os.path.join(KB_ROOT_PATH, knowledge_base_name) + + @staticmethod + def get_vs_path(knowledge_base_name: str): + return os.path.join(ESKBService.get_kb_path(knowledge_base_name), "vector_store") + + def do_create_kb(self): + if os.path.exists(self.doc_path): + os.makedirs(os.path.join(self.kb_path, "vector_store")) + + def vs_type(self) -> str: + return SupportedVSType.ES + + def _load_es(self, docs, embed_model): + # 将docs写入到ES中 + try: + # 连接 + 同时写入文档 + self.db = ElasticsearchStore.from_documents( + documents=docs, + embedding=embed_model, + es_url= f"{self.IP}:{self.PORT}", + index_name=self.index_name, + distance_strategy="COSINE", + query_field="context", + vector_query_field="vector", + verify_certs=False, + ) + except ConnectionError: + logger.error("连接到 Elasticsearch 失败!") + except Exception as e: + logger.error(f"Error 发生 : {e}") + + + + def do_search(self, query:str, top_k: int, score_threshold: float, embeddings: Embeddings): + # 文本相似性检索 + docs = self.db_init.similarity_search_with_score(query=query, + k=top_k) + return docs + + + def do_delete_doc(self, kb_file, **kwargs): + if self.es_client_python.indices.exists(index=self.index_name): + # 从向量数据库中删除索引(文档名称是Keyword) + query = { + "query": { + "term": { + "metadata.source.keyword": kb_file.filepath + } + } + } + # 注意设置size,默认返回10个。 + search_results = self.es_client_python.search(body=query, size=50) + delete_list = [hit["_id"] for hit in search_results['hits']['hits']] + if len(delete_list) == 0: + return None + else: + for doc_id in delete_list: + try: + self.es_client_python.delete(index=self.index_name, + id=doc_id, + refresh=True) + except Exception as e: + logger.error("ES Docs Delete Error!") + + # self.db_init.delete(ids=delete_list) + #self.es_client_python.indices.refresh(index=self.index_name) + + + def do_add_doc(self, docs: List[Document], **kwargs): + '''向知识库添加文件''' + self._load_es(docs=docs, embed_model=self.embeddings_model) + # 获取 id 和 source , 格式:[{"id": str, "metadata": dict}, ...] + file_path = docs[0].metadata.get("source") + if self.es_client_python.indices.exists(index=self.index_name): + query = { + "query": { + "term": { + "metadata.source.keyword": file_path + } + } + } + search_results = self.es_client_python.search(body=query) + if len(search_results["hits"]["hits"]) == 0: + raise ValueError("召回元素个数为0") + info_docs = [{"id":hit["_id"], "metadata": hit["_source"]["metadata"]} for hit in search_results["hits"]["hits"]] + return info_docs + + + def do_clear_vs(self): + """从知识库删除全部向量""" + if self.es_client_python.indices.exists(index=self.kb_name): + self.es_client_python.indices.delete(index=self.kb_name) + + + def do_drop_kb(self): + """删除知识库""" + # self.kb_file: 知识库路径 + if os.path.exists(self.kb_path): + shutil.rmtree(self.kb_path) + + + + + + + + From 4c5fc6ab0161fa255fa15c403371e65b12829ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=94=90=E5=9B=BD=E6=A2=81?= Date: Thu, 14 Sep 2023 12:38:37 +0800 Subject: [PATCH 2/2] edit model_config.py.example --- .gitignore | 5 ++++- configs/model_config.py.example | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index f5bd3e4..5475aab 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,7 @@ __pycache__/ /knowledge_base/ /configs/*.py .vscode/ -.pytest_cache/ +.pytest_cache +.DS_Store + + diff --git a/configs/model_config.py.example b/configs/model_config.py.example index 39c51ca..e59d0af 100644 --- a/configs/model_config.py.example +++ b/configs/model_config.py.example @@ -123,7 +123,14 @@ kbs_config = { }, "pg": { "connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat", - } + }, + "es": { + "host": "127.0.0.1", + "port": "9200", + "index_name": "test_index", + "user": "", + "password": "" + } } # 默认向量库类型。可选:faiss, milvus, pg.