From b0a9d8f30e7ef4aa2cf92d540663f86e9f7e2c21 Mon Sep 17 00:00:00 2001 From: glide-the <2533736852@qq.com> Date: Fri, 8 Sep 2023 21:32:41 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=98=BE=E7=A4=BAocr?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E8=BF=9B=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- document_loaders/mypdfloader.py | 13 ++++++++++++- requirements.txt | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/document_loaders/mypdfloader.py b/document_loaders/mypdfloader.py index 71e063d..c423643 100644 --- a/document_loaders/mypdfloader.py +++ b/document_loaders/mypdfloader.py @@ -1,5 +1,6 @@ from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader +import tqdm class RapidOCRPDFLoader(UnstructuredFileLoader): @@ -11,7 +12,14 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): ocr = RapidOCR() doc = fitz.open(filepath) resp = "" - for page in doc: + + b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0") + for i, page in enumerate(doc): + + # 更新描述 + b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i)) + # 立即显示进度条更新结果 + b_unit.refresh() # TODO: 依据文本与图片顺序调整处理方式 text = page.get_text("") resp += text + "\n" @@ -24,6 +32,9 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): if result: ocr_result = [line[1] for line in result] resp += "\n".join(ocr_result) + + # 更新进度 + b_unit.update(1) return resp text = pdf2text(self.file_path) diff --git a/requirements.txt b/requirements.txt index 910a9ed..7897232 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,4 @@ streamlit-chatbox>=1.1.6 streamlit-aggrid>=0.3.4.post3 httpx~=0.24.1 watchdog +tqdm