增加标题增强文档功能
This commit is contained in:
parent
ba6e089f97
commit
b81190005c
|
|
@ -2,3 +2,6 @@ from .ali_text_splitter import AliTextSplitter
|
||||||
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
||||||
from .chinese_text_splitter import ChineseTextSplitter
|
from .chinese_text_splitter import ChineseTextSplitter
|
||||||
from .zh_title_enhance import zh_title_enhance
|
from .zh_title_enhance import zh_title_enhance
|
||||||
|
from .customer_zh_title_enhance import zh_third_title_enhance
|
||||||
|
from .customer_zh_title_enhance import zh_second_title_enhance
|
||||||
|
from .customer_zh_title_enhance import zh_first_title_enhance
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,10 @@ from langchain_community.document_loaders import JSONLoader, TextLoader
|
||||||
|
|
||||||
from chatchat.settings import Settings
|
from chatchat.settings import Settings
|
||||||
from chatchat.server.file_rag.text_splitter import (
|
from chatchat.server.file_rag.text_splitter import (
|
||||||
zh_title_enhance as func_zh_title_enhance,
|
# zh_title_enhance as func_zh_title_enhance,
|
||||||
|
zh_third_title_enhance,
|
||||||
|
zh_second_title_enhance,
|
||||||
|
zh_first_title_enhance
|
||||||
)
|
)
|
||||||
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
|
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
|
||||||
from chatchat.utils import build_logger
|
from chatchat.utils import build_logger
|
||||||
|
|
@ -363,6 +366,14 @@ class KnowledgeFile:
|
||||||
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
|
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
|
||||||
text_splitter: TextSplitter = None,
|
text_splitter: TextSplitter = None,
|
||||||
):
|
):
|
||||||
|
def customize_zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Prefix every chunk's text with a sentence naming the document title.

    Each element's ``page_content`` is rewritten in place to start with
    "下文与(<title>)有关。", where ``<title>`` is ``self.doc_title_name``.
    ``self`` is not a parameter: it is read from the enclosing scope.

    Args:
        docs: The split document chunks to annotate. They are mutated in place.

    Returns:
        The same list that was passed in, or an empty list when ``docs`` is
        empty or ``None``. A list is always returned so the caller can keep
        assigning the result back to ``docs`` safely.
    """
    # Guard clause: the previous version printed the message and then fell
    # through, implicitly returning None and clobbering the caller's list.
    if not docs:
        print("文件不存在")
        return []

    # Annotations are quoted so they are never evaluated at definition time.
    title_prefix = f"下文与({self.doc_title_name})有关。"
    for doc in docs:
        doc.page_content = f"{title_prefix}{doc.page_content}"
    return docs
|
||||||
|
|
||||||
docs = docs or self.file2docs(refresh=refresh)
|
docs = docs or self.file2docs(refresh=refresh)
|
||||||
if not docs:
|
if not docs:
|
||||||
return []
|
return []
|
||||||
|
|
@ -383,7 +394,12 @@ class KnowledgeFile:
|
||||||
|
|
||||||
print(f"文档切分示例:{docs[0]}")
|
print(f"文档切分示例:{docs[0]}")
|
||||||
if zh_title_enhance:
|
if zh_title_enhance:
|
||||||
docs = func_zh_title_enhance(docs)
|
# docs = func_zh_title_enhance(docs)
|
||||||
|
docs = zh_third_title_enhance(docs)
|
||||||
|
docs = zh_second_title_enhance(docs)
|
||||||
|
docs = zh_first_title_enhance(docs)
|
||||||
|
docs = customize_zh_title_enhance(docs)
|
||||||
|
|
||||||
self.splited_docs = docs
|
self.splited_docs = docs
|
||||||
return self.splited_docs
|
return self.splited_docs
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue