add tree func for reading files in dirs (#471)
This commit is contained in:
parent
4295f6069d
commit
f7e120fe56
|
|
@ -20,6 +20,29 @@ from agent import bing_search
|
|||
from langchain.docstore.document import Document
|
||||
|
||||
|
||||
def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
    """Collect every file under *filepath*, recursing into subdirectories.

    Args:
        filepath: Path of a file or a directory to scan. A value that is
            not a ``str`` produces two empty lists.
        ignore_dir_names: Base names of directories to skip, together with
            everything below them. ``None`` means nothing is skipped.
        ignore_file_names: Base names of files to leave out of the result.
            ``None`` means nothing is left out.

    Returns:
        A pair ``(paths, names)`` of parallel lists: the full path of each
        file found and the base name of that path. Directory entries are
        visited in sorted order, so the result is deterministic.
        ``(None, None)`` is returned when *filepath* does not exist.
    """
    if ignore_dir_names is None:
        ignore_dir_names = []
    if ignore_file_names is None:
        ignore_file_names = []

    ret_list = []
    if isinstance(filepath, str):
        if not os.path.exists(filepath):
            print("路径不存在")
            return None, None
        if os.path.isfile(filepath):
            # An ignored single file falls through to the empty result below.
            if os.path.basename(filepath) not in ignore_file_names:
                return [filepath], [os.path.basename(filepath)]
        elif os.path.isdir(filepath) and os.path.basename(filepath) not in ignore_dir_names:
            # os.listdir() yields entries in arbitrary, filesystem-dependent
            # order; sorting makes the traversal reproducible.
            for entry in sorted(os.listdir(filepath)):
                fullfilepath = os.path.join(filepath, entry)
                if os.path.isfile(fullfilepath):
                    if entry not in ignore_file_names:
                        ret_list.append(fullfilepath)
                elif os.path.isdir(fullfilepath) and entry not in ignore_dir_names:
                    sub_paths = tree(fullfilepath, ignore_dir_names, ignore_file_names)[0]
                    # The recursive call returns None if the directory was
                    # removed after the isdir() check; skip it, don't crash.
                    if sub_paths:
                        ret_list.extend(sub_paths)
    return ret_list, [os.path.basename(p) for p in ret_list]
|
||||
|
||||
|
||||
def load_file(filepath, sentence_size=SENTENCE_SIZE):
|
||||
if filepath.lower().endswith(".md"):
|
||||
loader = UnstructuredFileLoader(filepath, mode="elements")
|
||||
|
|
@ -189,8 +212,7 @@ class LocalDocQA:
|
|||
return None
|
||||
elif os.path.isdir(filepath):
|
||||
docs = []
|
||||
for file in tqdm(os.listdir(filepath), desc="加载文件"):
|
||||
fullfilepath = os.path.join(filepath, file)
|
||||
for fullfilepath, file in tqdm(zip(*tree(filepath, ignore_dir_names=['tmp_files'])), desc="加载文件"):
|
||||
try:
|
||||
docs += load_file(fullfilepath, sentence_size)
|
||||
loaded_files.append(fullfilepath)
|
||||
|
|
|
|||
Loading…
Reference in New Issue