增加标题增强文档功能
This commit is contained in:
parent
b81190005c
commit
d096443b03
|
|
@ -21,7 +21,7 @@ from chatchat.server.file_rag.text_splitter import (
|
|||
)
|
||||
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
|
||||
from chatchat.utils import build_logger
|
||||
|
||||
import re
|
||||
|
||||
logger = build_logger()
|
||||
|
||||
|
|
@ -333,6 +333,7 @@ class KnowledgeFile:
|
|||
self.kb_name = knowledge_base_name
|
||||
self.filename = str(Path(filename).as_posix())
|
||||
self.ext = os.path.splitext(filename)[-1].lower()
|
||||
self.doc_title_name, file_extension = os.path.splitext(filename)
|
||||
if self.ext not in SUPPORTED_EXTS:
|
||||
raise ValueError(f"暂未支持的文件格式 {self.filename}")
|
||||
self.loader_kwargs = loader_kwargs
|
||||
|
|
@ -341,6 +342,7 @@ class KnowledgeFile:
|
|||
self.splited_docs = None
|
||||
self.document_loader_name = get_LoaderClass(self.ext)
|
||||
self.text_splitter_name = Settings.kb_settings.TEXT_SPLITTER_NAME
|
||||
print(f"KnowledgeFile: filepath:{self.filepath}")
|
||||
|
||||
def file2docs(self, refresh: bool = False):
|
||||
if self.docs is None or refresh:
|
||||
|
|
@ -366,6 +368,7 @@ class KnowledgeFile:
|
|||
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
|
||||
text_splitter: TextSplitter = None,
|
||||
):
|
||||
# Add the document title name to every paragraph (weiweiwang, 2025-01-13)
|
||||
def customize_zh_title_enhance(docs: Document) -> Document:
|
||||
if len(docs) > 0:
|
||||
for doc in docs:
|
||||
|
|
@ -375,6 +378,11 @@ class KnowledgeFile:
|
|||
print("文件不存在")
|
||||
|
||||
docs = docs or self.file2docs(refresh=refresh)
|
||||
# After loading, strip surrounding whitespace and collapse runs of consecutive line breaks into a single one (weiweiwang, 2025-01-13)
|
||||
for doc in docs:
|
||||
if doc.page_content.strip() != "":
|
||||
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||
|
||||
if not docs:
|
||||
return []
|
||||
if self.ext not in [".csv"]:
|
||||
|
|
|
|||
Loading…
Reference in New Issue