add tree func for reading files in dirs (#471)
This commit is contained in:
parent
4295f6069d
commit
f7e120fe56
|
|
@ -20,6 +20,29 @@ from agent import bing_search
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
    """Recursively collect every file located under ``filepath``.

    Args:
        filepath: Path of a file or a directory, given as a ``str``. Any
            other type yields two empty lists.
        ignore_dir_names: Base names of directories that must not be
            descended into. Defaults to no exclusions.
        ignore_file_names: Base names of files that must be left out.
            Defaults to no exclusions.

    Returns:
        A ``(paths, names)`` pair of parallel lists: the full path of every
        file found and the matching base names. Directory entries are
        visited in sorted order so the result is deterministic.
        ``(None, None)`` is returned when ``filepath`` does not exist.
    """
    if ignore_dir_names is None:
        ignore_dir_names = []
    if ignore_file_names is None:
        ignore_file_names = []

    ret_list = []
    if isinstance(filepath, str):
        if not os.path.exists(filepath):
            print("路径不存在")
            return None, None

        # normpath drops a trailing separator, so "docs/" still has the base
        # name "docs" and is matched against the ignore lists correctly.
        base_name = os.path.basename(os.path.normpath(filepath))

        if os.path.isfile(filepath):
            if base_name not in ignore_file_names:
                return [filepath], [base_name]
        elif os.path.isdir(filepath) and base_name not in ignore_dir_names:
            # os.listdir returns entries in arbitrary order; sort them so
            # repeated runs produce the same listing.
            for entry in sorted(os.listdir(filepath)):
                fullfilepath = os.path.join(filepath, entry)
                if os.path.isfile(fullfilepath):
                    if entry not in ignore_file_names:
                        ret_list.append(fullfilepath)
                elif os.path.isdir(fullfilepath) and entry not in ignore_dir_names:
                    sub_paths = tree(fullfilepath, ignore_dir_names, ignore_file_names)[0]
                    # The recursive call returns None when the directory
                    # vanished after it was listed; skip it instead of
                    # crashing on extend(None).
                    if sub_paths:
                        ret_list.extend(sub_paths)

    return ret_list, [os.path.basename(p) for p in ret_list]
|
||||||
|
|
||||||
|
|
||||||
def load_file(filepath, sentence_size=SENTENCE_SIZE):
|
def load_file(filepath, sentence_size=SENTENCE_SIZE):
|
||||||
if filepath.lower().endswith(".md"):
|
if filepath.lower().endswith(".md"):
|
||||||
loader = UnstructuredFileLoader(filepath, mode="elements")
|
loader = UnstructuredFileLoader(filepath, mode="elements")
|
||||||
|
|
@ -189,8 +212,7 @@ class LocalDocQA:
|
||||||
return None
|
return None
|
||||||
elif os.path.isdir(filepath):
|
elif os.path.isdir(filepath):
|
||||||
docs = []
|
docs = []
|
||||||
for file in tqdm(os.listdir(filepath), desc="加载文件"):
|
for fullfilepath, file in tqdm(zip(*tree(filepath, ignore_dir_names=['tmp_files'])), desc="加载文件"):
|
||||||
fullfilepath = os.path.join(filepath, file)
|
|
||||||
try:
|
try:
|
||||||
docs += load_file(fullfilepath, sentence_size)
|
docs += load_file(fullfilepath, sentence_size)
|
||||||
loaded_files.append(fullfilepath)
|
loaded_files.append(fullfilepath)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue