增加标题增强文档功能

This commit is contained in:
weiweiw 2025-01-13 10:35:45 +08:00
parent ba6e089f97
commit b81190005c
2 changed files with 21 additions and 2 deletions

View File

@ -2,3 +2,6 @@ from .ali_text_splitter import AliTextSplitter
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from .chinese_text_splitter import ChineseTextSplitter
from .zh_title_enhance import zh_title_enhance
from .customer_zh_title_enhance import zh_third_title_enhance
from .customer_zh_title_enhance import zh_second_title_enhance
from .customer_zh_title_enhance import zh_first_title_enhance

View File

@ -14,7 +14,10 @@ from langchain_community.document_loaders import JSONLoader, TextLoader
from chatchat.settings import Settings
from chatchat.server.file_rag.text_splitter import (
zh_title_enhance as func_zh_title_enhance,
# zh_title_enhance as func_zh_title_enhance,
zh_third_title_enhance,
zh_second_title_enhance,
zh_first_title_enhance
)
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
from chatchat.utils import build_logger
@ -363,6 +366,14 @@ class KnowledgeFile:
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
text_splitter: TextSplitter = None,
):
def customize_zh_title_enhance(docs: "List[Document]") -> "List[Document]":
    """Prefix each document's text with a sentence naming the file's title.

    Every element of ``docs`` has its ``page_content`` rewritten in place so
    that it starts with a sentence built from ``self.doc_title_name``.
    ``self`` is not a parameter: it is resolved from the enclosing scope
    (presumably the surrounding ``KnowledgeFile`` method -- confirm).

    Args:
        docs: The documents to tag. May be empty or ``None``.

    Returns:
        The same list with updated ``page_content`` values, or an empty list
        when there is nothing to process. A list is always returned so the
        caller can safely assign the result back to its ``docs`` variable.
    """
    if not docs:
        # Previously this branch fell through and returned None, which the
        # caller then stored as its document list. Keep the diagnostic
        # message but hand back an empty list instead.
        print("文件不存在")
        return []
    for doc in docs:
        doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}"
    return docs
docs = docs or self.file2docs(refresh=refresh)
if not docs:
return []
@ -383,7 +394,12 @@ class KnowledgeFile:
print(f"文档切分示例:{docs[0]}")
if zh_title_enhance:
docs = func_zh_title_enhance(docs)
# docs = func_zh_title_enhance(docs)
docs = zh_third_title_enhance(docs)
docs = zh_second_title_enhance(docs)
docs = zh_first_title_enhance(docs)
docs = customize_zh_title_enhance(docs)
self.splited_docs = docs
return self.splited_docs