增加显示 OCR 识别进度
This commit is contained in:
parent
60e6887e94
commit
b0a9d8f30e
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
|
||||||
class RapidOCRPDFLoader(UnstructuredFileLoader):
|
class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
|
|
@ -11,7 +12,14 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
ocr = RapidOCR()
|
ocr = RapidOCR()
|
||||||
doc = fitz.open(filepath)
|
doc = fitz.open(filepath)
|
||||||
resp = ""
|
resp = ""
|
||||||
for page in doc:
|
|
||||||
|
b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
|
||||||
|
for i, page in enumerate(doc):
|
||||||
|
|
||||||
|
# 更新描述
|
||||||
|
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
|
||||||
|
# 立即显示进度条更新结果
|
||||||
|
b_unit.refresh()
|
||||||
# TODO: 依据文本与图片顺序调整处理方式
|
# TODO: 依据文本与图片顺序调整处理方式
|
||||||
text = page.get_text("")
|
text = page.get_text("")
|
||||||
resp += text + "\n"
|
resp += text + "\n"
|
||||||
|
|
@ -24,6 +32,9 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
if result:
|
if result:
|
||||||
ocr_result = [line[1] for line in result]
|
ocr_result = [line[1] for line in result]
|
||||||
resp += "\n".join(ocr_result)
|
resp += "\n".join(ocr_result)
|
||||||
|
|
||||||
|
# 更新进度
|
||||||
|
b_unit.update(1)
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
text = pdf2text(self.file_path)
|
text = pdf2text(self.file_path)
|
||||||
|
|
|
||||||
|
|
@ -33,3 +33,4 @@ streamlit-chatbox>=1.1.6
|
||||||
streamlit-aggrid>=0.3.4.post3
|
streamlit-aggrid>=0.3.4.post3
|
||||||
httpx~=0.24.1
|
httpx~=0.24.1
|
||||||
watchdog
|
watchdog
|
||||||
|
tqdm
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue