From 526c4b52a89d74481d45bfaee561bcd36cad6f8d Mon Sep 17 00:00:00 2001
From: wvivi2023
Date: Mon, 6 Nov 2023 08:57:58 +0800
Subject: [PATCH] search related doc title before similarity search

Before running the similarity search, do a first search to find the most
related document and prepend its title (the source file's base name) to the
user query. Also load .docx files with Docx2txtLoader instead of
UnstructuredFileLoader, tighten the knowledge_base_chat prompt, and add
debug logging plus a small FAISS test script.
---
 configs/prompt_config.py.example                     |  2 +-
 server/chat/knowledge_base_chat.py                   | 17 ++++++++++++++
 server/knowledge_base/__init__.py                    |  4 ++++
 server/knowledge_base/kb_doc_api.py                  | 15 +++++++++++++
 server/knowledge_base/kb_service/faiss_kb_service.py |  2 ++
 server/knowledge_base/utils.py                       | 14 +++++++++---
 test.py                                              | 21 ++++++++++++++++++
 text_splitter/chinese_recursive_text_splitter.py     |  4 ++--
 8 files changed, 73 insertions(+), 6 deletions(-)
 create mode 100644 test.py

diff --git a/configs/prompt_config.py.example b/configs/prompt_config.py.example
index a52b1f7..a84ca71 100644
--- a/configs/prompt_config.py.example
+++ b/configs/prompt_config.py.example
@@ -33,7 +33,7 @@ PROMPT_TEMPLATES["llm_chat"] = {
 PROMPT_TEMPLATES["knowledge_base_chat"] = {
     "default":
         """
-        <指令>根据已知信息，简洁和专业的来回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，答案请使用中文。</指令>
+        <指令>完全依据已知信息的内容，以一个电力专家的视角，简洁和专业的来回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，不回答与问题无关的问题，答案请使用中文。</指令>
         <已知信息>{{ context }}</已知信息>
         <问题>{{ question }}</问题>
         """,
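The {{ context }} and {{ question }} placeholders in the template above are
filled by the chat service with the joined page_content of the retrieved
documents and the raw user query. A minimal sketch of that substitution,
assuming the placeholders are Jinja2 syntax (which is what the server's
History.to_msg_template rendering suggests); the values below are
illustrative, not taken from the patch:

    from jinja2 import Template

    template = Template(
        "<已知信息>{{ context }}</已知信息>\n"
        "<问题>{{ question }}</问题>"
    )
    # Illustrative values; the server supplies the joined page_content of
    # the retrieved documents as context and the user query as question.
    print(template.render(context="(retrieved document text)",
                          question="准入手续的内容是什么?"))
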
diff --git a/server/chat/knowledge_base_chat.py b/server/chat/knowledge_base_chat.py
index 1b9463f..cdb7a8a 100644
--- a/server/chat/knowledge_base_chat.py
+++ b/server/chat/knowledge_base_chat.py
@@ -48,6 +48,11 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
     ) -> AsyncIterable[str]:
+
+        # debug: log the incoming query, model and prompt selection
+        print(f"knowledge_base_chat_iterator, query: {query}, "
+              f"model_name: {model_name}, prompt_name: {prompt_name}")
+
         callback = AsyncIteratorCallbackHandler()
         model = get_ChatOpenAI(
             model_name=model_name,
@@ -55,12 +60,21 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
         )
         docs = search_docs(query, knowledge_base_name, top_k, score_threshold)
         context = "\n".join([doc.page_content for doc in docs])
+        # debug: log the retrieved context
+        print(f"knowledge_base_chat_iterator, search docs context: {context}")
 
         prompt_template = get_prompt_template("knowledge_base_chat", prompt_name)
         input_msg = History(role="user", content=prompt_template).to_msg_template(False)
+
+        # debug: log the rendered input message
+        print(f"knowledge_base_chat_iterator, input_msg: {input_msg}")
+
         chat_prompt = ChatPromptTemplate.from_messages(
             [i.to_msg_template() for i in history] + [input_msg])
 
+        # debug: log the assembled chat prompt
+        print(f"knowledge_base_chat_iterator, chat_prompt: {chat_prompt}")
+
         chain = LLMChain(prompt=chat_prompt, llm=model)
 
         # Begin a task that runs in the background.
@@ -69,6 +83,9 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
             callback.done),
         )
 
+        # debug: confirm the background task was started
+        print("task call end")
+
         source_documents = []
         for inum, doc in enumerate(docs):
             filename = os.path.split(doc.metadata["source"])[-1]
diff --git a/server/knowledge_base/__init__.py b/server/knowledge_base/__init__.py
--- a/server/knowledge_base/__init__.py
+++ b/server/knowledge_base/__init__.py
@@ -0,0 +1,4 @@
+# Re-export KnowledgeFile at the package level so that
+# `from server.knowledge_base import KnowledgeFile` works (used by test.py).
+from server.knowledge_base.utils import KnowledgeFile
+
diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py
--- a/server/knowledge_base/kb_doc_api.py
+++ b/server/knowledge_base/kb_doc_api.py
@@ -57,8 +57,23 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["你好"]),
     kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
     if kb is None:
         return []
+
+    print(f"search_docs, query: {query}")
+    pre_doc = kb.search_docs(query, top_k, SCORE_THRESHOLD)
+    if len(pre_doc) > 0:
+        print(f"search_docs, len(pre_doc): {len(pre_doc)}")
+        filepath = pre_doc[0][0].metadata['source']
+        file_name = os.path.basename(filepath)
+        file_name, file_extension = os.path.splitext(file_name)
+        query = "根据" + file_name + "," + query  # "According to <title>, <query>"
+
+    print(f"search_docs, query with title prepended: {query}")
     docs = kb.search_docs(query, top_k, score_threshold)
     data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
+    # i = 1
+    # for x in docs:
+    #     print(f"similar doc {i}: {x}")
+    #     i = i + 1
     return data
diff --git a/server/knowledge_base/kb_service/faiss_kb_service.py b/server/knowledge_base/kb_service/faiss_kb_service.py
index a72fcf7..28de35c 100644
--- a/server/knowledge_base/kb_service/faiss_kb_service.py
+++ b/server/knowledge_base/kb_service/faiss_kb_service.py
@@ -60,8 +60,10 @@ class FaissKBService(KBService):
                   score_threshold: float = SCORE_THRESHOLD,
                   embeddings: Embeddings = None,
                   ) -> List[Document]:
+        print(f"do_search, top_k: {top_k}, score_threshold: {score_threshold}")
         with self.load_vector_store().acquire() as vs:
             docs = vs.similarity_search_with_score(query, k=top_k, score_threshold=score_threshold)
+        print(f"do_search, docs: {docs}")
         return docs
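The core of the change is the two-pass retrieval in search_docs above: a
first similarity search identifies the most related document, and its file
name (treated as the document title) is prepended to the query before the
second search. A standalone sketch of the idea, where kb stands for any KB
service exposing search_docs(query, top_k, threshold) as FaissKBService
does, and the function name is illustrative:

    import os

    def title_steered_search(kb, query: str, top_k: int, score_threshold: float):
        """Two-pass search: prepend the best match's title to the query."""
        # pass 1: find the most related document
        pre_doc = kb.search_docs(query, top_k, score_threshold)
        if pre_doc:
            # metadata['source'] holds the source file path; its base name
            # (without extension) serves as the document title
            filepath = pre_doc[0][0].metadata["source"]
            title, _ext = os.path.splitext(os.path.basename(filepath))
            query = "根据" + title + "," + query  # "According to <title>, <query>"
        # pass 2: similarity search with the title-augmented query
        return kb.search_docs(query, top_k, score_threshold)
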
diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py
index c73d021..045ff9c 100644
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@@ -15,6 +15,7 @@ from configs import (
 import importlib
 from text_splitter import zh_title_enhance as func_zh_title_enhance
 import langchain.document_loaders
+from langchain.document_loaders.word_document import Docx2txtLoader
 from langchain.docstore.document import Document
 from langchain.text_splitter import TextSplitter
 from pathlib import Path
@@ -76,8 +77,9 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
                "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
                "UnstructuredFileLoader": ['.eml', '.msg', '.rst',
                                           '.rtf', '.txt', '.xml',
-                                          '.docx', '.epub', '.odt',
+                                          '.epub', '.odt',
                                           '.ppt', '.pptx', '.tsv'],
+               "Docx2txtLoader": ['.docx'],
                }
 SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
@@ -281,6 +283,7 @@ class KnowledgeFile:
         self.splited_docs = None
         self.document_loader_name = get_LoaderClass(self.ext)
         self.text_splitter_name = TEXT_SPLITTER_NAME
+        print(f"KnowledgeFile: filepath: {self.filepath}")
 
     def file2docs(self, refresh: bool=False):
         if self.docs is None or refresh:
@@ -312,8 +315,13 @@ class KnowledgeFile:
                 doc.metadata["source"] = os.path.basename(self.filepath)
         else:
             docs = text_splitter.split_documents(docs)
-
-        print(f"document split sample: {docs[0]}")
+
+        # print(f"document split sample: {docs[0]}")
+        i = 0
+        for doc in docs:
+            print(f"********** chunk {i}: {doc}")
+            i += 1
+
         if zh_title_enhance:
             docs = func_zh_title_enhance(docs)
         self.splited_docs = docs
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..1299786
--- /dev/null
+++ b/test.py
@@ -0,0 +1,21 @@
+
+from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
+from server.knowledge_base import KnowledgeFile
+
+if __name__ == '__main__':
+    from pprint import pprint
+
+    # kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
+    # kb_file = KnowledgeFile(filename="国网安徽信通公司安全准入实施要求_修订.docx", knowledge_base_name="test")
+    # docs = kb_file.file2docs()
+    # pprint(docs[-1])
+    # docs = kb_file.file2text()
+    # pprint(docs[-1])
+
+    faissService = FaissKBService("test")
+    faissService.add_doc(KnowledgeFile("国网安徽信通公司安全准入实施要求_修订.docx", "test"))
+    # faissService.delete_doc(KnowledgeFile("README.md", "test"))
+    # faissService.do_drop_kb()
+    print(faissService.search_docs("准入手续的内容是什么?"))
+
+
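The LOADER_DICT change above takes effect through get_LoaderClass, which
maps a file extension to the name of the registered loader; KnowledgeFile
then instantiates that loader for the file. A rough sketch of the lookup,
assuming the dictionary shape shown in utils.py (the helper itself already
exists in that module; this is only an illustration of the resolution):

    LOADER_DICT = {
        "UnstructuredFileLoader": ['.eml', '.txt', '.epub', '.odt'],
        "Docx2txtLoader": ['.docx'],
    }

    def get_LoaderClass(file_extension):
        # return the name of the loader class registered for this extension
        for LoaderClass, extensions in LOADER_DICT.items():
            if file_extension in extensions:
                return LoaderClass

    assert get_LoaderClass(".docx") == "Docx2txtLoader"
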
diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py
index 70b4b29..d5ee666 100644
--- a/text_splitter/chinese_recursive_text_splitter.py
+++ b/text_splitter/chinese_recursive_text_splitter.py
@@ -98,7 +98,7 @@ if __name__ == "__main__":
     ]
     # text = """"""
     for inum, text in enumerate(ls):
-        print(inum)
+        print(f"************** segment: {inum}")
         chunks = text_splitter.split_text(text)
         for chunk in chunks:
-            print(chunk)
+            print(f"************** chunk: {chunk}")
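For reference, a minimal sketch of driving the splitter the way the
__main__ block above does; the constructor arguments mirror that test
block, and the chunk parameters and sample text are illustrative:

    from text_splitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter

    # Illustrative parameters; the defaults used elsewhere in the repo may differ.
    text_splitter = ChineseRecursiveTextSplitter(
        keep_separator=True,
        is_separator_regex=True,
        chunk_size=50,
        chunk_overlap=0,
    )
    for inum, text in enumerate(["中国对外贸易形势报告：近期中国出口表现亮眼。"]):
        print(f"************** segment: {inum}")
        for chunk in text_splitter.split_text(text):
            print(f"************** chunk: {chunk}")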