增加构建文档metadata检索的开关

This commit is contained in:
glide-the 2023-09-05 18:35:11 +08:00
parent c26d66772a
commit 491c9605d9
1 changed file with 21 additions and 9 deletions

View File

@@ -191,7 +191,7 @@ class KnowledgeFile:
# TODO: 增加依据文件格式匹配text_splitter # TODO: 增加依据文件格式匹配text_splitter
self.text_splitter_name = None self.text_splitter_name = None
def file2text(self, using_zh_title_enhance=ZH_TITLE_ENHANCE, refresh: bool = False): def file2text(self, using_zh_title_enhance=ZH_TITLE_ENHANCE, refresh: bool = False, build_meta_data: bool = True):
if self.docs is not None and not refresh: if self.docs is not None and not refresh:
return self.docs return self.docs
@@ -250,6 +250,18 @@ class KnowledgeFile:
) )
docs = loader.load_and_split(text_splitter) docs = loader.load_and_split(text_splitter)
if build_meta_data:
meta_data = docs[0].metadata
# 对meta_data每项格式化成 "<key>":"<value>" 形式
meta_data = {f'"{k}":"{v}"' for k, v in meta_data.items()}
# 转换成字符串
meta_data = "<metadata>\r\n" + "\r\n\b".join(meta_data) + "\r\n</metadata>"
doc = Document(page_content=str(meta_data), metadata=docs[0].metadata)
# 将doc 添加到docs的第一项
docs.insert(0, doc)
print(docs[0]) print(docs[0])
if using_zh_title_enhance: if using_zh_title_enhance:
docs = zh_title_enhance(docs) docs = zh_title_enhance(docs)
@@ -264,9 +276,9 @@ class KnowledgeFile:
def run_in_thread_pool( def run_in_thread_pool(
func: Callable, func: Callable,
params: List[Dict] = [], params: List[Dict] = [],
pool: ThreadPoolExecutor = None, pool: ThreadPoolExecutor = None,
) -> Generator: ) -> Generator:
''' '''
在线程池中批量运行任务并将运行结果以生成器的形式返回 在线程池中批量运行任务并将运行结果以生成器的形式返回
@@ -285,8 +297,8 @@ def run_in_thread_pool(
def files2docs_in_thread( def files2docs_in_thread(
files: List[Union[KnowledgeFile, Tuple[str, str], Dict]], files: List[Union[KnowledgeFile, Tuple[str, str], Dict]],
pool: ThreadPoolExecutor = None, pool: ThreadPoolExecutor = None,
) -> Generator: ) -> Generator:
''' '''
利用多线程批量将文件转化成langchain Document. 利用多线程批量将文件转化成langchain Document.