title enhancement and remove the logic of query scope

This commit is contained in:
wvivi2023 2023-12-13 09:51:17 +08:00
parent b1abafeb50
commit 9fa7a1fbbf
3 changed files with 34 additions and 11 deletions

BIN
server/.DS_Store vendored

Binary file not shown.

View File

@ -63,15 +63,16 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
#print(f"chain1._call, result:{result},similiarit text:{query1}")
pre_doc = kb.search_docs(query, 1, None)
print(f"len(pre_doc):{len(pre_doc)}")
if len(pre_doc) > 0:
print(f"search_docs, pre_doc:{pre_doc}")
filpath = pre_doc[0][0].metadata['source']
file_name = os.path.basename(filpath)
file_name, file_extension = os.path.splitext(file_name)
query = "根据" +file_name + ""+ query
# pre_doc = kb.search_docs(query, 1, None)
# print(f"len(pre_doc):{len(pre_doc)}")
# if len(pre_doc) > 0:
# print(f"search_docs, pre_doc:{pre_doc}")
# filpath = pre_doc[0][0].metadata['source']
# file_name = os.path.basename(filpath)
# file_name, file_extension = os.path.splitext(file_name)
# query = "根据" +file_name + ""+ query
pre_doc = []
print(f"search_docs, query:{query}")
docs = kb.search_docs(query, top_k, score_threshold)
print(f"search_docs, docs:{docs}")

View File

@ -275,7 +275,9 @@ class KnowledgeFile:
'''
self.kb_name = knowledge_base_name
self.filename = filename
self.ext = os.path.splitext(filename)[-1].lower()
#self.ext = os.path.splitext(filename)[-1].lower()
self.doc_title_name, file_extension = os.path.splitext(filename)
self.ext = file_extension.lower()
if self.ext not in SUPPORTED_EXTS:
raise ValueError(f"暂未支持的文件格式 {self.ext}")
self.filepath = get_file_path(knowledge_base_name, filename)
@ -301,6 +303,14 @@ class KnowledgeFile:
chunk_overlap: int = OVERLAP_SIZE,
text_splitter: TextSplitter = None,
):
def customize_zh_title_enhance(docs: "List[Document]") -> "List[Document]":
    """Prefix every chunk with a sentence naming the source document.

    Each document's ``page_content`` is rewritten in place so that it
    starts with the title held in ``self.doc_title_name``. ``self`` is not
    a parameter: it is captured from the enclosing method's scope.

    Args:
        docs: The split documents to annotate. May be empty or ``None``.

    Returns:
        The same list object with its documents mutated in place. For
        empty input the (empty) list is returned unchanged, and ``None``
        is normalised to ``[]``, so the caller can always iterate the
        result.
    """
    if not docs:
        # The original fell through here and implicitly returned None,
        # which broke the caller's subsequent ``for doc in docs`` loop.
        print("文件不存在")
        return docs if docs is not None else []
    for doc in docs:
        doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}"
    return docs
docs = docs or self.file2docs(refresh=refresh)
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
@ -339,10 +349,22 @@ class KnowledgeFile:
i = i+1
if zh_title_enhance:
docs = func_zh_title_enhance(docs)
docs = customize_zh_title_enhance(docs)
i = 1
outputfile = file_name_without_extension + "_split.txt"
# 打开文件以写入模式
with open(outputfile, 'w') as file:
for doc in docs:
print(f"**********切分段{i}{doc}")
file.write(f"\n**********切分段{i}")
file.write(doc.page_content)
i = i+1
self.splited_docs = docs
return self.splited_docs
def file2text(
self,
zh_title_enhance: bool = ZH_TITLE_ENHANCE,