增加标题增强文档功能

This commit is contained in:
weiweiw 2025-01-13 10:56:24 +08:00
parent b81190005c
commit d096443b03
1 changed file with 9 additions and 1 deletion

View File

@@ -21,7 +21,7 @@ from chatchat.server.file_rag.text_splitter import (
)
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
from chatchat.utils import build_logger
import re
logger = build_logger()
@@ -333,6 +333,7 @@ class KnowledgeFile:
self.kb_name = knowledge_base_name
self.filename = str(Path(filename).as_posix())
self.ext = os.path.splitext(filename)[-1].lower()
self.doc_title_name, file_extension = os.path.splitext(filename)
if self.ext not in SUPPORTED_EXTS:
raise ValueError(f"暂未支持的文件格式 {self.filename}")
self.loader_kwargs = loader_kwargs
@@ -341,6 +342,7 @@ class KnowledgeFile:
self.splited_docs = None
self.document_loader_name = get_LoaderClass(self.ext)
self.text_splitter_name = Settings.kb_settings.TEXT_SPLITTER_NAME
print(f"KnowledgeFile: filepath:{self.filepath}")
def file2docs(self, refresh: bool = False):
if self.docs is None or refresh:
@@ -366,6 +368,7 @@ class KnowledgeFile:
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
text_splitter: TextSplitter = None,
):
# Add the title name to every paragraph, by weiweiwang 2025/1/13
def customize_zh_title_enhance(docs: Document) -> Document:
if len(docs) > 0:
for doc in docs:
@@ -375,6 +378,11 @@ class KnowledgeFile:
print("文件不存在")
docs = docs or self.file2docs(refresh=refresh)
# Remove the redundant line breaks after loading, by weiweiwang 2025/1/13
for doc in docs:
if doc.page_content.strip() != "":
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
if not docs:
return []
if self.ext not in [".csv"]: