diff --git a/document_loaders/mypdfloader.py b/document_loaders/mypdfloader.py index 71e063d..c423643 100644 --- a/document_loaders/mypdfloader.py +++ b/document_loaders/mypdfloader.py @@ -1,5 +1,6 @@ from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader +import tqdm class RapidOCRPDFLoader(UnstructuredFileLoader): @@ -11,7 +12,14 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): ocr = RapidOCR() doc = fitz.open(filepath) resp = "" - for page in doc: + + b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0") + for i, page in enumerate(doc): + + # 更新描述 + b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i)) + # 立即显示进度条更新结果 + b_unit.refresh() # TODO: 依据文本与图片顺序调整处理方式 text = page.get_text("") resp += text + "\n" @@ -24,6 +32,9 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): if result: ocr_result = [line[1] for line in result] resp += "\n".join(ocr_result) + + # 更新进度 + b_unit.update(1) return resp text = pdf2text(self.file_path) diff --git a/requirements.txt b/requirements.txt index 910a9ed..7897232 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,4 @@ streamlit-chatbox>=1.1.6 streamlit-aggrid>=0.3.4.post3 httpx~=0.24.1 watchdog +tqdm