From 9fa7a1fbbf8b21d6fb86d58540248fe3381b8d07 Mon Sep 17 00:00:00 2001 From: wvivi2023 Date: Wed, 13 Dec 2023 09:51:17 +0800 Subject: [PATCH] title enhancement and remove the logic of query scope --- server/.DS_Store | Bin 6148 -> 6148 bytes server/knowledge_base/kb_doc_api.py | 17 +++++++++-------- server/knowledge_base/utils.py | 28 +++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/server/.DS_Store b/server/.DS_Store index 0916aacee0541069fcbfb14b03a08071aebd508e..f0b0b295fc3ec167bbed2c64aedbef4b42d9c709 100644 GIT binary patch delta 69 zcmZoMXffDe&cwK5vISG2x 0: - print(f"search_docs, pre_doc:{pre_doc}") - filpath = pre_doc[0][0].metadata['source'] - file_name = os.path.basename(filpath) - file_name, file_extension = os.path.splitext(file_name) - query = "根据" +file_name + ","+ query + # pre_doc = kb.search_docs(query, 1, None) + # print(f"len(pre_doc):{len(pre_doc)}") + # if len(pre_doc) > 0: + # print(f"search_docs, pre_doc:{pre_doc}") + # filpath = pre_doc[0][0].metadata['source'] + # file_name = os.path.basename(filpath) + # file_name, file_extension = os.path.splitext(file_name) + # query = "根据" +file_name + ","+ query + pre_doc = [] print(f"search_docs, query:{query}") docs = kb.search_docs(query, top_k, score_threshold) print(f"search_docs, docs:{docs}") diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index e91587b..448422c 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -275,7 +275,9 @@ class KnowledgeFile: ''' self.kb_name = knowledge_base_name self.filename = filename - self.ext = os.path.splitext(filename)[-1].lower() + #self.ext = os.path.splitext(filename)[-1].lower() + self.doc_title_name, file_extension = os.path.splitext(filename) + self.ext = file_extension.lower() if self.ext not in SUPPORTED_EXTS: raise ValueError(f"暂未支持的文件格式 {self.ext}") self.filepath = get_file_path(knowledge_base_name, filename) @@ -301,6 +303,14 @@ class KnowledgeFile: chunk_overlap: int = OVERLAP_SIZE, text_splitter: TextSplitter = None, ): + def customize_zh_title_enhance(docs: Document) -> Document: + if len(docs) > 0: + for doc in docs: + doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}" + return docs + else: + print("文件不存在") + docs = docs or self.file2docs(refresh=refresh) file_name_without_extension, file_extension = os.path.splitext(self.filepath) print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}") @@ -337,11 +347,23 @@ class KnowledgeFile: file.write(f"\n**********切分段{i}") file.write(doc.page_content) i = i+1 - + if zh_title_enhance: - docs = func_zh_title_enhance(docs) + docs = customize_zh_title_enhance(docs) + i = 1 + outputfile = file_name_without_extension + "_split.txt" + # 打开文件以写入模式 + with open(outputfile, 'w') as file: + for doc in docs: + print(f"**********切分段{i}:{doc}") + file.write(f"\n**********切分段{i}") + file.write(doc.page_content) + i = i+1 + self.splited_docs = docs return self.splited_docs + + def file2text( self,