diff --git a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/__init__.py b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/__init__.py
index c0e418a..7b4afcf 100644
--- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/__init__.py
+++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/__init__.py
@@ -2,3 +2,6 @@ from .ali_text_splitter import AliTextSplitter
 from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
 from .chinese_text_splitter import ChineseTextSplitter
 from .zh_title_enhance import zh_title_enhance
+from .customer_zh_title_enhance import zh_third_title_enhance
+from .customer_zh_title_enhance import zh_second_title_enhance
+from .customer_zh_title_enhance import zh_first_title_enhance
diff --git a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py
index 21acee7..521d4e5 100644
--- a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py
+++ b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py
@@ -14,7 +14,9 @@ from langchain_community.document_loaders import JSONLoader, TextLoader
 
 from chatchat.settings import Settings
 from chatchat.server.file_rag.text_splitter import (
-    zh_title_enhance as func_zh_title_enhance,
+    zh_third_title_enhance,
+    zh_second_title_enhance,
+    zh_first_title_enhance,
 )
 from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
 from chatchat.utils import build_logger
@@ -363,6 +365,21 @@ class KnowledgeFile:
         chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
         text_splitter: TextSplitter = None,
     ):
+        def customize_zh_title_enhance(docs: "List[Document]") -> "List[Document]":
+            """Prefix every chunk with a sentence naming the source document.
+
+            Each document's ``page_content`` is modified in place. The input
+            list is always returned, so the caller never receives ``None``.
+            """
+            if not docs:
+                print("文件不存在")
+                return docs
+            # NOTE(review): ``self.doc_title_name`` is not assigned anywhere in
+            # this patch; confirm that KnowledgeFile defines it.
+            for doc in docs:
+                doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}"
+            return docs
+
         docs = docs or self.file2docs(refresh=refresh)
         if not docs:
             return []
@@ -383,7 +400,13 @@ class KnowledgeFile:
 
         print(f"文档切分示例:{docs[0]}")
         if zh_title_enhance:
-            docs = func_zh_title_enhance(docs)
+            # Run the three title-enhancement passes (third, second, first),
+            # then prepend the document title to every chunk.
+            docs = zh_third_title_enhance(docs)
+            docs = zh_second_title_enhance(docs)
+            docs = zh_first_title_enhance(docs)
+            docs = customize_zh_title_enhance(docs)
+
         self.splited_docs = docs
         return self.splited_docs
 