update loader.py
This commit is contained in:
parent
d5ffdaa281
commit
e8a37ff4c7
|
|
@ -26,6 +26,9 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
|
||||||
if filepath.lower().endswith(".md"):
|
if filepath.lower().endswith(".md"):
|
||||||
loader = UnstructuredFileLoader(filepath, mode="elements")
|
loader = UnstructuredFileLoader(filepath, mode="elements")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
elif filepath.lower().endswith(".txt"):
|
||||||
|
loader = UnstructuredFileLoader(filepath, mode="elements")
|
||||||
|
docs = loader.load()
|
||||||
elif filepath.lower().endswith(".pdf"):
|
elif filepath.lower().endswith(".pdf"):
|
||||||
loader = UnstructuredPaddlePDFLoader(filepath)
|
loader = UnstructuredPaddlePDFLoader(filepath)
|
||||||
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
|
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
|
||||||
|
|
@ -47,7 +50,7 @@ def write_check_file(filepath, docs):
|
||||||
if not os.path.exists(folder_path):
|
if not os.path.exists(folder_path):
|
||||||
os.makedirs(folder_path)
|
os.makedirs(folder_path)
|
||||||
fp = os.path.join(folder_path, 'load_file.txt')
|
fp = os.path.join(folder_path, 'load_file.txt')
|
||||||
fout = open(fp, 'a')
|
with open(fp, 'a+', encoding='utf-8') as fout:
|
||||||
fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
|
fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
|
||||||
fout.write('\n')
|
fout.write('\n')
|
||||||
for i in docs:
|
for i in docs:
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
|
||||||
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
|
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
|
||||||
doc = fitz.open(filepath)
|
doc = fitz.open(filepath)
|
||||||
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
|
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
|
||||||
img_name = os.path.join(full_dir_path, '.tmp.png')
|
img_name = os.path.join(full_dir_path, 'tmp.png')
|
||||||
with open(txt_file_path, 'w', encoding='utf-8') as fout:
|
with open(txt_file_path, 'w', encoding='utf-8') as fout:
|
||||||
|
|
||||||
for i in range(doc.page_count):
|
for i in range(doc.page_count):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue