update pdf_loader.py
This commit is contained in:
parent
e8a37ff4c7
commit
6d1523728b
|
|
@@ -15,13 +15,11 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
|
||||||
full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
|
full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
|
||||||
if not os.path.exists(full_dir_path):
|
if not os.path.exists(full_dir_path):
|
||||||
os.makedirs(full_dir_path)
|
os.makedirs(full_dir_path)
|
||||||
filename = os.path.split(filepath)[-1]
|
|
||||||
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
|
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
|
||||||
doc = fitz.open(filepath)
|
doc = fitz.open(filepath)
|
||||||
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
|
txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
|
||||||
img_name = os.path.join(full_dir_path, 'tmp.png')
|
img_name = os.path.join(full_dir_path, 'tmp.png')
|
||||||
with open(txt_file_path, 'w', encoding='utf-8') as fout:
|
with open(txt_file_path, 'w', encoding='utf-8') as fout:
|
||||||
|
|
||||||
for i in range(doc.page_count):
|
for i in range(doc.page_count):
|
||||||
page = doc[i]
|
page = doc[i]
|
||||||
text = page.get_text("")
|
text = page.get_text("")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue