From 526c4b52a89d74481d45bfaee561bcd36cad6f8d Mon Sep 17 00:00:00 2001
From: wvivi2023
Date: Mon, 6 Nov 2023 08:57:58 +0800
Subject: [PATCH] search related doc title before similarity search

Before running the similarity search, do a first search to find the most
related document and prepend its title (the source file's base name) to the
user query. Also load .docx files with Docx2txtLoader instead of
UnstructuredFileLoader, tighten the knowledge_base_chat prompt, and add
debug logging plus a small FAISS test script.
---
 configs/prompt_config.py.example                     |  2 +-
 server/chat/knowledge_base_chat.py                   | 17 ++++++++++++++
 server/knowledge_base/__init__.py                    |  4 ++++
 server/knowledge_base/kb_doc_api.py                  | 15 +++++++++++++
 server/knowledge_base/kb_service/faiss_kb_service.py |  2 ++
 server/knowledge_base/utils.py                       | 14 +++++++++---
 test.py                                              | 21 ++++++++++++++++++
 text_splitter/chinese_recursive_text_splitter.py     |  4 ++--
 8 files changed, 73 insertions(+), 6 deletions(-)
 create mode 100644 test.py

diff --git a/configs/prompt_config.py.example b/configs/prompt_config.py.example
index a52b1f7..a84ca71 100644
--- a/configs/prompt_config.py.example
+++ b/configs/prompt_config.py.example
@@ -33,7 +33,7 @@ PROMPT_TEMPLATES["llm_chat"] = {
 PROMPT_TEMPLATES["knowledge_base_chat"] = {
     "default":
         """
-        <指令>根据已知信息，简洁和专业的来回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，答案请使用中文。</指令>
+        <指令>完全依据已知信息的内容，以一个电力专家的视角，简洁和专业的来回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，不回答与问题无关的问题，答案请使用中文。</指令>
         <已知信息>{{ context }}</已知信息>
         <问题>{{ question }}</问题>
         """,
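The {{ context }} and {{ question }} placeholders in the template above are
filled by the chat service with the joined page_content of the retrieved
documents and the raw user query. A minimal sketch of that substitution,
assuming the placeholders are Jinja2 syntax (which is what the server's
History.to_msg_template rendering suggests); the values below are
illustrative, not taken from the patch:

    from jinja2 import Template

    template = Template(
        "<已知信息>{{ context }}</已知信息>\n"
        "<问题>{{ question }}</问题>"
    )
    # Illustrative values; the server supplies the joined page_content of
    # the retrieved documents as context and the user query as question.
    print(template.render(context="(retrieved document text)",
                          question="准入手续的内容是什么?"))
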
diff --git a/server/chat/knowledge_base_chat.py b/server/chat/knowledge_base_chat.py
index 1b9463f..cdb7a8a 100644
--- a/server/chat/knowledge_base_chat.py
+++ b/server/chat/knowledge_base_chat.py
@@ -48,6 +48,11 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
     ) -> AsyncIterable[str]:
+
+        # debug: log the incoming query, model and prompt selection
+        print(f"knowledge_base_chat_iterator, query: {query}, "
+              f"model_name: {model_name}, prompt_name: {prompt_name}")
+
         callback = AsyncIteratorCallbackHandler()
         model = get_ChatOpenAI(
             model_name=model_name,
@@ -55,12 +60,21 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
         )
         docs = search_docs(query, knowledge_base_name, top_k, score_threshold)
         context = "\n".join([doc.page_content for doc in docs])
+        # debug: log the retrieved context
+        print(f"knowledge_base_chat_iterator, search docs context: {context}")
 
         prompt_template = get_prompt_template("knowledge_base_chat", prompt_name)
         input_msg = History(role="user", content=prompt_template).to_msg_template(False)
+
+        # debug: log the rendered input message
+        print(f"knowledge_base_chat_iterator, input_msg: {input_msg}")
+
         chat_prompt = ChatPromptTemplate.from_messages(
             [i.to_msg_template() for i in history] + [input_msg])
 
+        # debug: log the assembled chat prompt
+        print(f"knowledge_base_chat_iterator, chat_prompt: {chat_prompt}")
+
         chain = LLMChain(prompt=chat_prompt, llm=model)
 
         # Begin a task that runs in the background.
@@ -69,6 +83,9 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
             callback.done),
         )
 
+        # debug: confirm the background task was started
+        print("task call end")
+
         source_documents = []
         for inum, doc in enumerate(docs):
             filename = os.path.split(doc.metadata["source"])[-1]
diff --git a/server/knowledge_base/__init__.py b/server/knowledge_base/__init__.py
--- a/server/knowledge_base/__init__.py
+++ b/server/knowledge_base/__init__.py
@@ -0,0 +1,4 @@
+# Re-export KnowledgeFile at the package level so that
+# `from server.knowledge_base import KnowledgeFile` works (used by test.py).
+from server.knowledge_base.utils import KnowledgeFile
+
diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py
--- a/server/knowledge_base/kb_doc_api.py
+++ b/server/knowledge_base/kb_doc_api.py
@@ -57,8 +57,23 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["你好"]),
     kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
     if kb is None:
         return []
+
+    print(f"search_docs, query: {query}")
+    pre_doc = kb.search_docs(query, top_k, SCORE_THRESHOLD)
+    if len(pre_doc) > 0:
+        print(f"search_docs, len(pre_doc): {len(pre_doc)}")
+        filepath = pre_doc[0][0].metadata['source']
+        file_name = os.path.basename(filepath)
+        file_name, file_extension = os.path.splitext(file_name)
+        query = "根据" + file_name + "," + query  # "According to <title>, <query>"
+
+    print(f"search_docs, query with title prepended: {query}")
     docs = kb.search_docs(query, top_k, score_threshold)
     data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
+    # i = 1
+    # for x in docs:
+    #     print(f"similar doc {i}: {x}")
+    #     i = i + 1
     return data
diff --git a/server/knowledge_base/kb_service/faiss_kb_service.py b/server/knowledge_base/kb_service/faiss_kb_service.py
index a72fcf7..28de35c 100644
--- a/server/knowledge_base/kb_service/faiss_kb_service.py
+++ b/server/knowledge_base/kb_service/faiss_kb_service.py
@@ -60,8 +60,10 @@ class FaissKBService(KBService):
                   score_threshold: float = SCORE_THRESHOLD,
                   embeddings: Embeddings = None,
                   ) -> List[Document]:
+        print(f"do_search, top_k: {top_k}, score_threshold: {score_threshold}")
         with self.load_vector_store().acquire() as vs:
             docs = vs.similarity_search_with_score(query, k=top_k, score_threshold=score_threshold)
+        print(f"do_search, docs: {docs}")
         return docs
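The core of the change is the two-pass retrieval in search_docs above: a
first similarity search identifies the most related document, and its file
name (treated as the document title) is prepended to the query before the
second search. A standalone sketch of the idea, where kb stands for any KB
service exposing search_docs(query, top_k, threshold) as FaissKBService
does, and the function name is illustrative:

    import os

    def title_steered_search(kb, query: str, top_k: int, score_threshold: float):
        """Two-pass search: prepend the best match's title to the query."""
        # pass 1: find the most related document
        pre_doc = kb.search_docs(query, top_k, score_threshold)
        if pre_doc:
            # metadata['source'] holds the source file path; its base name
            # (without extension) serves as the document title
            filepath = pre_doc[0][0].metadata["source"]
            title, _ext = os.path.splitext(os.path.basename(filepath))
            query = "根据" + title + "," + query  # "According to <title>, <query>"
        # pass 2: similarity search with the title-augmented query
        return kb.search_docs(query, top_k, score_threshold)
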
diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py
index c73d021..045ff9c 100644
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@@ -15,6 +15,7 @@ from configs import (
 import importlib
 from text_splitter import zh_title_enhance as func_zh_title_enhance
 import langchain.document_loaders
+from langchain.document_loaders.word_document import Docx2txtLoader
 from langchain.docstore.document import Document
 from langchain.text_splitter import TextSplitter
 from pathlib import Path
@@ -76,8 +77,9 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
                "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
                "UnstructuredFileLoader": ['.eml', '.msg', '.rst',
                                           '.rtf', '.txt', '.xml',
-                                          '.docx', '.epub', '.odt',
+                                          '.epub', '.odt',
                                           '.ppt', '.pptx', '.tsv'],
+               "Docx2txtLoader": ['.docx'],
                }
 SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
@@ -281,6 +283,7 @@ class KnowledgeFile:
         self.splited_docs = None
         self.document_loader_name = get_LoaderClass(self.ext)
         self.text_splitter_name = TEXT_SPLITTER_NAME
+        print(f"KnowledgeFile: filepath: {self.filepath}")
 
     def file2docs(self, refresh: bool=False):
         if self.docs is None or refresh:
@@ -312,8 +315,13 @@ class KnowledgeFile:
                 doc.metadata["source"] = os.path.basename(self.filepath)
         else:
             docs = text_splitter.split_documents(docs)
-
-        print(f"document split sample: {docs[0]}")
+
+        # print(f"document split sample: {docs[0]}")
+        i = 0
+        for doc in docs:
+            print(f"********** chunk {i}: {doc}")
+            i += 1
+
         if zh_title_enhance:
             docs = func_zh_title_enhance(docs)
         self.splited_docs = docs
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..1299786
--- /dev/null
+++ b/test.py
@@ -0,0 +1,21 @@
+
+from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
+from server.knowledge_base import KnowledgeFile
+
+if __name__ == '__main__':
+    from pprint import pprint
+
+    # kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
+    # kb_file = KnowledgeFile(filename="国网安徽信通公司安全准入实施要求_修订.docx", knowledge_base_name="test")
+    # docs = kb_file.file2docs()
+    # pprint(docs[-1])
+    # docs = kb_file.file2text()
+    # pprint(docs[-1])
+
+    faissService = FaissKBService("test")
+    faissService.add_doc(KnowledgeFile("国网安徽信通公司安全准入实施要求_修订.docx", "test"))
+    # faissService.delete_doc(KnowledgeFile("README.md", "test"))
+    # faissService.do_drop_kb()
+    print(faissService.search_docs("准入手续的内容是什么?"))
+
+
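The LOADER_DICT change above takes effect through get_LoaderClass, which
maps a file extension to the name of the registered loader; KnowledgeFile
then instantiates that loader for the file. A rough sketch of the lookup,
assuming the dictionary shape shown in utils.py (the helper itself already
exists in that module; this is only an illustration of the resolution):

    LOADER_DICT = {
        "UnstructuredFileLoader": ['.eml', '.txt', '.epub', '.odt'],
        "Docx2txtLoader": ['.docx'],
    }

    def get_LoaderClass(file_extension):
        # return the name of the loader class registered for this extension
        for LoaderClass, extensions in LOADER_DICT.items():
            if file_extension in extensions:
                return LoaderClass

    assert get_LoaderClass(".docx") == "Docx2txtLoader"
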
diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py
index 70b4b29..d5ee666 100644
--- a/text_splitter/chinese_recursive_text_splitter.py
+++ b/text_splitter/chinese_recursive_text_splitter.py
@@ -98,7 +98,7 @@ if __name__ == "__main__":
     ]
     # text = """"""
     for inum, text in enumerate(ls):
-        print(inum)
+        print(f"************** segment: {inum}")
         chunks = text_splitter.split_text(text)
         for chunk in chunks:
-            print(chunk)
+            print(f"************** chunk: {chunk}")
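For reference, a minimal sketch of driving the splitter the way the
__main__ block above does; the constructor arguments mirror that test
block, and the chunk parameters and sample text are illustrative:

    from text_splitter.chinese_recursive_text_splitter import ChineseRecursiveTextSplitter

    # Illustrative parameters; the defaults used elsewhere in the repo may differ.
    text_splitter = ChineseRecursiveTextSplitter(
        keep_separator=True,
        is_separator_regex=True,
        chunk_size=50,
        chunk_overlap=0,
    )
    for inum, text in enumerate(["中国对外贸易形势报告：近期中国出口表现亮眼。"]):
        print(f"************** segment: {inum}")
        for chunk in text_splitter.split_text(text):
            print(f"************** chunk: {chunk}")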