增加标题增强文档功能
This commit is contained in:
parent
b81190005c
commit
d096443b03
|
|
@ -21,7 +21,7 @@ from chatchat.server.file_rag.text_splitter import (
|
||||||
)
|
)
|
||||||
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
|
from chatchat.server.utils import run_in_process_pool, run_in_thread_pool
|
||||||
from chatchat.utils import build_logger
|
from chatchat.utils import build_logger
|
||||||
|
import re
|
||||||
|
|
||||||
logger = build_logger()
|
logger = build_logger()
|
||||||
|
|
||||||
|
|
@ -333,6 +333,7 @@ class KnowledgeFile:
|
||||||
self.kb_name = knowledge_base_name
|
self.kb_name = knowledge_base_name
|
||||||
self.filename = str(Path(filename).as_posix())
|
self.filename = str(Path(filename).as_posix())
|
||||||
self.ext = os.path.splitext(filename)[-1].lower()
|
self.ext = os.path.splitext(filename)[-1].lower()
|
||||||
|
self.doc_title_name, file_extension = os.path.splitext(filename)
|
||||||
if self.ext not in SUPPORTED_EXTS:
|
if self.ext not in SUPPORTED_EXTS:
|
||||||
raise ValueError(f"暂未支持的文件格式 {self.filename}")
|
raise ValueError(f"暂未支持的文件格式 {self.filename}")
|
||||||
self.loader_kwargs = loader_kwargs
|
self.loader_kwargs = loader_kwargs
|
||||||
|
|
@ -341,6 +342,7 @@ class KnowledgeFile:
|
||||||
self.splited_docs = None
|
self.splited_docs = None
|
||||||
self.document_loader_name = get_LoaderClass(self.ext)
|
self.document_loader_name = get_LoaderClass(self.ext)
|
||||||
self.text_splitter_name = Settings.kb_settings.TEXT_SPLITTER_NAME
|
self.text_splitter_name = Settings.kb_settings.TEXT_SPLITTER_NAME
|
||||||
|
print(f"KnowledgeFile: filepath:{self.filepath}")
|
||||||
|
|
||||||
def file2docs(self, refresh: bool = False):
|
def file2docs(self, refresh: bool = False):
|
||||||
if self.docs is None or refresh:
|
if self.docs is None or refresh:
|
||||||
|
|
@ -366,6 +368,7 @@ class KnowledgeFile:
|
||||||
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
|
chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE,
|
||||||
text_splitter: TextSplitter = None,
|
text_splitter: TextSplitter = None,
|
||||||
):
|
):
|
||||||
|
# Add the document title to every paragraph, by weiweiwang 2025/1/13
|
||||||
def customize_zh_title_enhance(docs: Document) -> Document:
|
def customize_zh_title_enhance(docs: Document) -> Document:
|
||||||
if len(docs) > 0:
|
if len(docs) > 0:
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
|
@ -375,6 +378,11 @@ class KnowledgeFile:
|
||||||
print("文件不存在")
|
print("文件不存在")
|
||||||
|
|
||||||
docs = docs or self.file2docs(refresh=refresh)
|
docs = docs or self.file2docs(refresh=refresh)
|
||||||
|
# Collapse redundant consecutive line breaks after loading, by weiweiwang 2025/1/13
|
||||||
|
for doc in docs:
|
||||||
|
if doc.page_content.strip() != "":
|
||||||
|
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
return []
|
return []
|
||||||
if self.ext not in [".csv"]:
|
if self.ext not in [".csv"]:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue