title enhancement and remove the logic of query scope

This commit is contained in:
wvivi2023 2023-12-13 09:51:17 +08:00
parent b1abafeb50
commit 9fa7a1fbbf
3 changed files with 34 additions and 11 deletions

BIN
server/.DS_Store vendored

Binary file not shown.

View File

@ -63,15 +63,16 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
#print(f"chain1._call, result:{result},similiarit text:{query1}") #print(f"chain1._call, result:{result},similiarit text:{query1}")
pre_doc = kb.search_docs(query, 1, None) # pre_doc = kb.search_docs(query, 1, None)
print(f"len(pre_doc):{len(pre_doc)}") # print(f"len(pre_doc):{len(pre_doc)}")
if len(pre_doc) > 0: # if len(pre_doc) > 0:
print(f"search_docs, pre_doc:{pre_doc}") # print(f"search_docs, pre_doc:{pre_doc}")
filpath = pre_doc[0][0].metadata['source'] # filpath = pre_doc[0][0].metadata['source']
file_name = os.path.basename(filpath) # file_name = os.path.basename(filpath)
file_name, file_extension = os.path.splitext(file_name) # file_name, file_extension = os.path.splitext(file_name)
query = "根据" +file_name + ""+ query # query = "根据" +file_name + ""+ query
pre_doc = []
print(f"search_docs, query:{query}") print(f"search_docs, query:{query}")
docs = kb.search_docs(query, top_k, score_threshold) docs = kb.search_docs(query, top_k, score_threshold)
print(f"search_docs, docs:{docs}") print(f"search_docs, docs:{docs}")

View File

@ -275,7 +275,9 @@ class KnowledgeFile:
''' '''
self.kb_name = knowledge_base_name self.kb_name = knowledge_base_name
self.filename = filename self.filename = filename
self.ext = os.path.splitext(filename)[-1].lower() #self.ext = os.path.splitext(filename)[-1].lower()
self.doc_title_name, file_extension = os.path.splitext(filename)
self.ext = file_extension.lower()
if self.ext not in SUPPORTED_EXTS: if self.ext not in SUPPORTED_EXTS:
raise ValueError(f"暂未支持的文件格式 {self.ext}") raise ValueError(f"暂未支持的文件格式 {self.ext}")
self.filepath = get_file_path(knowledge_base_name, filename) self.filepath = get_file_path(knowledge_base_name, filename)
@ -301,6 +303,14 @@ class KnowledgeFile:
chunk_overlap: int = OVERLAP_SIZE, chunk_overlap: int = OVERLAP_SIZE,
text_splitter: TextSplitter = None, text_splitter: TextSplitter = None,
): ):
def customize_zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Prefix every chunk with the title of the file it came from.

    The ``page_content`` of each element is rewritten in place so that it
    starts with ``下文与(<title>)有关。``, where ``<title>`` is
    ``self.doc_title_name`` read from the enclosing method's ``self``.

    Args:
        docs: The document chunks to annotate; may be empty.

    Returns:
        The same ``docs`` object that was passed in. An empty input is
        returned unchanged (after printing a diagnostic) rather than
        ``None``, because the caller assigns the result back to ``docs``
        and iterates over it immediately afterwards.
    """
    if not docs:
        # Nothing to annotate: keep the diagnostic, but still hand the
        # (empty) collection back so the caller's loop does not hit None.
        print("文件不存在")
        return docs
    for doc in docs:
        doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}"
    return docs
docs = docs or self.file2docs(refresh=refresh) docs = docs or self.file2docs(refresh=refresh)
file_name_without_extension, file_extension = os.path.splitext(self.filepath) file_name_without_extension, file_extension = os.path.splitext(self.filepath)
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}") print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
@ -337,11 +347,23 @@ class KnowledgeFile:
file.write(f"\n**********切分段{i}") file.write(f"\n**********切分段{i}")
file.write(doc.page_content) file.write(doc.page_content)
i = i+1 i = i+1
if zh_title_enhance: if zh_title_enhance:
docs = func_zh_title_enhance(docs) docs = customize_zh_title_enhance(docs)
i = 1
outputfile = file_name_without_extension + "_split.txt"
# 打开文件以写入模式
with open(outputfile, 'w') as file:
for doc in docs:
print(f"**********切分段{i}{doc}")
file.write(f"\n**********切分段{i}")
file.write(doc.page_content)
i = i+1
self.splited_docs = docs self.splited_docs = docs
return self.splited_docs return self.splited_docs
def file2text( def file2text(
self, self,