From f7e120fe56f1e0aa5ae47b78541c89a7353032b1 Mon Sep 17 00:00:00 2001
From: Ding Junyao <11372753+DingJunyao@users.noreply.github.com>
Date: Sat, 27 May 2023 22:00:43 +0800
Subject: [PATCH] add tree func for reading files in dirs (#471)

---
 chains/local_doc_qa.py | 59 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py
index 7bde8a0..bd7971f 100644
--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -20,6 +20,62 @@ from agent import bing_search
 from langchain.docstore.document import Document
 
 
+def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
+    """Collect every file under ``filepath``, descending into sub-directories.
+
+    Args:
+        filepath: Path of a file or directory (``str`` or ``os.PathLike``).
+        ignore_dir_names: Base names of directories to skip, including
+            everything below them. ``None`` means skip nothing.
+        ignore_file_names: Base names of files to skip. ``None`` means skip
+            nothing.
+
+    Returns:
+        A ``(paths, names)`` tuple of parallel lists: the full path and the
+        base name of each file found, in sorted depth-first order.
+        ``(None, None)`` when ``filepath`` does not exist; two empty lists
+        when it is ignored, is not a path, or contains no files.
+    """
+    ignored_dirs = set(ignore_dir_names or ())
+    ignored_files = set(ignore_file_names or ())
+    try:
+        root = os.fspath(filepath)
+    except TypeError:
+        root = None
+    if not isinstance(root, str):
+        return [], []
+    if not os.path.exists(root):
+        print("路径不存在")
+        return None, None
+    if os.path.isfile(root):
+        name = os.path.basename(root)
+        return ([], []) if name in ignored_files else ([root], [name])
+    if not os.path.isdir(root) or os.path.basename(root) in ignored_dirs:
+        return [], []
+
+    paths = []
+    visited = set()  # real paths already walked; breaks symlink cycles
+    pending = [root]
+    while pending:
+        current = pending.pop()
+        real = os.path.realpath(current)
+        if real in visited:
+            continue
+        visited.add(real)
+        subdirs = []
+        # Sort so the result does not depend on os.listdir()'s arbitrary order.
+        for entry in sorted(os.listdir(current)):
+            full = os.path.join(current, entry)
+            if os.path.isfile(full):
+                if entry not in ignored_files:
+                    paths.append(full)
+            elif os.path.isdir(full) and entry not in ignored_dirs:
+                subdirs.append(full)
+        # Reversed so pop() visits sub-directories in sorted order.
+        pending.extend(reversed(subdirs))
+    return paths, [os.path.basename(p) for p in paths]
+
+
 def load_file(filepath, sentence_size=SENTENCE_SIZE):
     if filepath.lower().endswith(".md"):
         loader = UnstructuredFileLoader(filepath, mode="elements")
@@ -189,8 +245,7 @@ class LocalDocQA:
                     return None
             elif os.path.isdir(filepath):
                 docs = []
-                for file in tqdm(os.listdir(filepath), desc="加载文件"):
-                    fullfilepath = os.path.join(filepath, file)
+                for fullfilepath, file in tqdm(zip(*tree(filepath, ignore_dir_names=['tmp_files'])), desc="加载文件"):
                     try:
                         docs += load_file(fullfilepath, sentence_size)
                         loaded_files.append(fullfilepath)