增加显示 OCR 识别进度

This commit is contained in:
glide-the 2023-09-08 21:32:41 +08:00
parent 60e6887e94
commit b0a9d8f30e
2 changed files with 13 additions and 1 deletion

View File

@@ -1,5 +1,6 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
import tqdm
class RapidOCRPDFLoader(UnstructuredFileLoader):
@@ -11,7 +12,14 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
ocr = RapidOCR()
doc = fitz.open(filepath)
resp = ""
for page in doc:
b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
for i, page in enumerate(doc):
# 更新描述
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
# 立即显示进度条更新结果
b_unit.refresh()
# TODO: 依据文本与图片顺序调整处理方式
text = page.get_text("")
resp += text + "\n"
@@ -24,6 +32,9 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
if result:
ocr_result = [line[1] for line in result]
resp += "\n".join(ocr_result)
# 更新进度
b_unit.update(1)
return resp
text = pdf2text(self.file_path)

View File

@@ -33,3 +33,4 @@ streamlit-chatbox>=1.1.6
streamlit-aggrid>=0.3.4.post3
httpx~=0.24.1
watchdog
tqdm