增加显示 OCR 识别进度
This commit is contained in:
parent
60e6887e94
commit
b0a9d8f30e
|
|
@@ -1,5 +1,6 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
import tqdm
|
||||
|
||||
|
||||
class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||
|
|
@@ -11,7 +12,14 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
|||
ocr = RapidOCR()
|
||||
doc = fitz.open(filepath)
|
||||
resp = ""
|
||||
for page in doc:
|
||||
|
||||
b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
|
||||
for i, page in enumerate(doc):
|
||||
|
||||
# 更新描述
|
||||
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
|
||||
# 立即显示进度条更新结果
|
||||
b_unit.refresh()
|
||||
# TODO: 依据文本与图片顺序调整处理方式
|
||||
text = page.get_text("")
|
||||
resp += text + "\n"
|
||||
|
|
@@ -24,6 +32,9 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
|||
if result:
|
||||
ocr_result = [line[1] for line in result]
|
||||
resp += "\n".join(ocr_result)
|
||||
|
||||
# 更新进度
|
||||
b_unit.update(1)
|
||||
return resp
|
||||
|
||||
text = pdf2text(self.file_path)
|
||||
|
|
|
|||
|
|
@@ -33,3 +33,4 @@ streamlit-chatbox>=1.1.6
|
|||
streamlit-aggrid>=0.3.4.post3
|
||||
httpx~=0.24.1
|
||||
watchdog
|
||||
tqdm
|
||||
|
|
|
|||
Loading…
Reference in New Issue