From 8fa99026c83fc6019f8010ad3998638f5bcfbd3d Mon Sep 17 00:00:00 2001 From: WilliamChen-luckbob <58684828+WilliamChen-luckbob@users.noreply.github.com> Date: Thu, 28 Sep 2023 19:18:31 +0800 Subject: [PATCH] =?UTF-8?q?=E6=A0=B9=E6=8D=AE=E5=AE=98=E6=96=B9=E6=96=87?= =?UTF-8?q?=E6=A1=A3=EF=BC=8C=E6=B7=BB=E5=8A=A0=E5=AF=B9=E8=8B=B1=E6=96=87?= =?UTF-8?q?=E7=89=88=E7=9A=84bge=20embedding=E7=9A=84=E6=8C=87=E7=A4=BA?= =?UTF-8?q?=E6=A8=A1=E6=9D=BF=20(#1585)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: zR <2448370773@qq.com> --- server/knowledge_base/kb_cache/base.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/server/knowledge_base/kb_cache/base.py b/server/knowledge_base/kb_cache/base.py index cd60aa4..59426fa 100644 --- a/server/knowledge_base/kb_cache/base.py +++ b/server/knowledge_base/kb_cache/base.py @@ -124,9 +124,18 @@ class EmbeddingsPool(CachePool): if model == "text-embedding-ada-002": # openai text-embedding-ada-002 embeddings = OpenAIEmbeddings(openai_api_key=get_model_path(model), chunk_size=CHUNK_SIZE) elif 'bge-' in model: - embeddings = HuggingFaceBgeEmbeddings(model_name=get_model_path(model), - model_kwargs={'device': device}, - query_instruction="为这个句子生成表示以用于检索相关文章:") + if 'zh' in model: + # for chinese model + query_instruction = "为这个句子生成表示以用于检索相关文章:" + elif 'en' in model: + # for english model + query_instruction = "Represent this sentence for searching relevant passages:" + else: + # maybe ReRanker or else, just use empty string instead + query_instruction = "" + embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_dict[model], + model_kwargs={'device': device}, + query_instruction=query_instruction) if model == "bge-large-zh-noinstruct": # bge large -noinstruct embedding embeddings.query_instruction = "" else: