add tree func for reading files in dirs (#471)

This commit is contained in:
Ding Junyao 2023-05-27 22:00:43 +08:00 committed by GitHub
parent 4295f6069d
commit f7e120fe56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 24 additions and 2 deletions

View File

@ -20,6 +20,29 @@ from agent import bing_search
from langchain.docstore.document import Document
def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
"""返回两个列表,第一个列表为 filepath 下全部文件的完整路径, 第二个为对应的文件名"""
if ignore_dir_names is None:
ignore_dir_names = []
if ignore_file_names is None:
ignore_file_names = []
ret_list = []
if isinstance(filepath, str):
if not os.path.exists(filepath):
print("路径不存在")
return None, None
elif os.path.isfile(filepath) and os.path.basename(filepath) not in ignore_file_names:
return [filepath], [os.path.basename(filepath)]
elif os.path.isdir(filepath) and os.path.basename(filepath) not in ignore_dir_names:
for file in os.listdir(filepath):
fullfilepath = os.path.join(filepath, file)
if os.path.isfile(fullfilepath) and os.path.basename(fullfilepath) not in ignore_file_names:
ret_list.append(fullfilepath)
if os.path.isdir(fullfilepath) and os.path.basename(fullfilepath) not in ignore_dir_names:
ret_list.extend(tree(fullfilepath, ignore_dir_names, ignore_file_names)[0])
return ret_list, [os.path.basename(p) for p in ret_list]
def load_file(filepath, sentence_size=SENTENCE_SIZE):
if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath, mode="elements")
@ -189,8 +212,7 @@ class LocalDocQA:
return None
elif os.path.isdir(filepath):
docs = []
for file in tqdm(os.listdir(filepath), desc="加载文件"):
fullfilepath = os.path.join(filepath, file)
for fullfilepath, file in tqdm(zip(*tree(filepath, ignore_dir_names=['tmp_files'])), desc="加载文件"):
try:
docs += load_file(fullfilepath, sentence_size)
loaded_files.append(fullfilepath)