0.2.6 enhance

0.2.6 enhance
This commit is contained in:
wvivi2023 2023-11-13 09:20:19 +08:00
parent 4cdd2a5e79
commit 60a12c05f6
12 changed files with 88 additions and 30 deletions

BIN
.DS_Store vendored

Binary file not shown.

BIN
img/siji.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

BIN
server/.DS_Store vendored

Binary file not shown.

View File

@@ -41,11 +41,13 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
print(f"search_docs, query:{query}")
docs = kb.search_docs(query, top_k, score_threshold)
if len(pre_doc) > 0:
if docs is not None:
docs.append(pre_doc[0])
else:
docs = pre_doc[0]
data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
# i = 1
# for x in docs:
# print(f"相似文档 {i}: {x}")
# i = i+1
return data

View File

@@ -63,7 +63,7 @@ class FaissKBService(KBService):
print(f"do_search,top_k:{top_k},score_threshold:{score_threshold}")
with self.load_vector_store().acquire() as vs:
docs = vs.similarity_search_with_score(query, k=top_k, score_threshold=score_threshold)
print(f"do_search,docs:{docs}")
#print(f"do_search,docs:{docs}")
return docs
def do_add_doc(self,

View File

@@ -68,7 +68,7 @@ def load_embeddings(model: str = EMBEDDING_MODEL, device: str = embedding_device
from server.knowledge_base.kb_cache.base import embeddings_pool
return embeddings_pool.load_embeddings(model=model, device=device)
#PDFPlumberLoader
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
"UnstructuredMarkdownLoader": ['.md'],
"CustomJSONLoader": [".json"],
@@ -302,6 +302,8 @@ class KnowledgeFile:
text_splitter: TextSplitter = None,
):
docs = docs or self.file2docs(refresh=refresh)
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
if not docs:
return []
if self.ext not in [".csv"]:
@@ -314,13 +316,27 @@ class KnowledgeFile:
if doc.metadata:
doc.metadata["source"] = os.path.basename(self.filepath)
else:
print(f"**********************docs2texts: text_splitter.split_documents(docs)")
outputfile = file_name_without_extension + "_source.txt"
with open(outputfile, 'w') as file:
for doc in docs:
file.write(doc.page_content)
docs = text_splitter.split_documents(docs)
#print(f"文档切分示例:{docs[0]}")
i = 0
for doc in docs:
print(f"**********切分段{i}{doc}")
i = i+1
# print(f"KnowledgeFile: filepath:{self.filepath}")
# file_name_without_extension, file_extension = os.path.splitext(self.filepath)
# print("filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
i = 1
outputfile = file_name_without_extension + "_split.txt"
# 打开文件以写入模式
with open(outputfile, 'w') as file:
for doc in docs:
print(f"**********切分段{i}{doc}")
file.write(f"分段{i}")
file.write(doc.page_content)
i = i+1
if zh_title_enhance:
docs = func_zh_title_enhance(docs)
@@ -407,7 +423,8 @@ if __name__ == "__main__":
kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
# kb_file.text_splitter_name = "RecursiveCharacterTextSplitter"
docs = kb_file.file2docs()
pprint(docs[-1])
#pprint(docs[-1])
docs = kb_file.file2text()
pprint(docs[-1])
docs = kb_file.docs2texts()
#docs = kb_file.file2text()
#pprint(docs[-1])

View File

@@ -13,9 +13,9 @@ if __name__ == '__main__':
# pprint(docs[-1])
faissService = FaissKBService("test")
faissService.add_doc(KnowledgeFile("国网安徽信通公司安全准入实施要求_修订.docx", "test"))
faissService.add_doc(KnowledgeFile("电力电缆故障测寻车技术规范.docx", "test"))
# faissService.delete_doc(KnowledgeFile("README.md", "test"))
# faissService.do_drop_kb()
print(faissService.search_docs("准入手续的内容是什么?"))
#print(faissService.search_docs("准入手续的内容是什么?"))

File diff suppressed because one or more lines are too long

View File

@@ -17,7 +17,7 @@ if __name__ == "__main__":
menu_items={
'Get Help': 'https://github.com/chatchat-space/Langchain-Chatchat',
'Report a bug': "https://github.com/chatchat-space/Langchain-Chatchat/issues",
'About': f"""欢迎使用 Langchain-Chatchat WebUI {VERSION}"""
'About': f"""欢迎使用 思极大模型 WebUI {VERSION}"""
}
)
@@ -36,7 +36,8 @@ if __name__ == "__main__":
st.image(
os.path.join(
"img",
"logo-long-chatchat-trans-v2.png"
"siji.jpg"
#"logo-long-chatchat-trans-v2.png"
),
use_column_width=True
)

Binary file not shown.