From 67b7c99d0312d9644a3d4dcb99ae58e1c46221ac Mon Sep 17 00:00:00 2001 From: liunux4odoo <41217877+liunux4odoo@users.noreply.github.com> Date: Mon, 4 Dec 2023 09:39:56 +0800 Subject: [PATCH] =?UTF-8?q?ocr=20=E6=94=AF=E6=8C=81=20GPU=20=E5=8A=A0?= =?UTF-8?q?=E9=80=9F=EF=BC=88=E9=9C=80=E8=A6=81=E6=89=8B=E5=8A=A8=E5=AE=89?= =?UTF-8?q?=E8=A3=85=20rapidocr=5Fpaddle[gpu])=EF=BC=9B=E7=9F=A5=E8=AF=86?= =?UTF-8?q?=E5=BA=93=E6=94=AF=E6=8C=81=20MHTML=20=E5=92=8C=20Evernote=20?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E3=80=82=20(#2265)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在 requirements 和 Wiki 中增加对可选文档加载器 SDK 的说明 ( close #2264 ) --- document_loaders/myimgloader.py | 4 ++-- document_loaders/mypdfloader.py | 4 ++-- document_loaders/ocr.py | 18 ++++++++++++++++++ knowledge_base/samples/content/wiki | 2 +- requirements.txt | 6 ++++++ requirements_api.txt | 7 +++++++ requirements_lite.txt | 7 +++++++ server/knowledge_base/utils.py | 2 ++ 8 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 document_loaders/ocr.py diff --git a/document_loaders/myimgloader.py b/document_loaders/myimgloader.py index 8648192..e09c617 100644 --- a/document_loaders/myimgloader.py +++ b/document_loaders/myimgloader.py @@ -1,13 +1,13 @@ from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader +from document_loaders.ocr import get_ocr class RapidOCRLoader(UnstructuredFileLoader): def _get_elements(self) -> List: def img2text(filepath): - from rapidocr_onnxruntime import RapidOCR resp = "" - ocr = RapidOCR() + ocr = get_ocr() result, _ = ocr(filepath) if result: ocr_result = [line[1] for line in result] diff --git a/document_loaders/mypdfloader.py b/document_loaders/mypdfloader.py index 6cb7726..51778b8 100644 --- a/document_loaders/mypdfloader.py +++ b/document_loaders/mypdfloader.py @@ -1,5 +1,6 @@ from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader +from 
document_loaders.ocr import get_ocr import tqdm @@ -7,9 +8,8 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): def _get_elements(self) -> List: def pdf2text(filepath): import fitz # pyMuPDF里面的fitz包,不要与pip install fitz混淆 - from rapidocr_onnxruntime import RapidOCR import numpy as np - ocr = RapidOCR() + ocr = get_ocr() doc = fitz.open(filepath) resp = "" diff --git a/document_loaders/ocr.py b/document_loaders/ocr.py new file mode 100644 index 0000000..2b66dd3 --- /dev/null +++ b/document_loaders/ocr.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + + +if TYPE_CHECKING: + try: + from rapidocr_paddle import RapidOCR + except ImportError: + from rapidocr_onnxruntime import RapidOCR + + +def get_ocr(use_cuda: bool = True) -> "RapidOCR": + try: + from rapidocr_paddle import RapidOCR + ocr = RapidOCR(det_use_cuda=use_cuda, cls_use_cuda=use_cuda, rec_use_cuda=use_cuda) + except ImportError: + from rapidocr_onnxruntime import RapidOCR + ocr = RapidOCR() + return ocr diff --git a/knowledge_base/samples/content/wiki b/knowledge_base/samples/content/wiki index f789e5d..9a3fa7a 160000 --- a/knowledge_base/samples/content/wiki +++ b/knowledge_base/samples/content/wiki @@ -1 +1 @@ -Subproject commit f789e5dde10f91136012f3470c020c8d34572436 +Subproject commit 9a3fa7a77f8748748b1c656fe8919ad5c4c63e3f diff --git a/requirements.txt b/requirements.txt index 6d6161a..399521b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,6 +38,12 @@ einops>=0.7.0 transformers_stream_generator==0.0.4 vllm==0.2.2; sys_platform == "linux" +# optional document loaders +# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files +# jq # for .json and .jsonl files. 
suggest `conda install jq` on windows +# html2text # for .enex files +# beautifulsoup4 # for .mhtml files +# pysrt # for .srt files # Online api libs dependencies diff --git a/requirements_api.txt b/requirements_api.txt index 4ad926b..ec1005f 100644 --- a/requirements_api.txt +++ b/requirements_api.txt @@ -40,6 +40,13 @@ vllm==0.2.2; sys_platform == "linux" httpx[brotli,http2,socks]>=0.25.2 +# optional document loaders +# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files +# jq # for .json and .jsonl files. suggest `conda install jq` on windows +# html2text # for .enex files +# beautifulsoup4 # for .mhtml files +# pysrt # for .srt files + # Online api libs dependencies # zhipuai>=1.0.7 diff --git a/requirements_lite.txt b/requirements_lite.txt index 664c959..ad01376 100644 --- a/requirements_lite.txt +++ b/requirements_lite.txt @@ -19,6 +19,13 @@ faiss-cpu # PyMuPDF==1.22.5 # install if need pdf # rapidocr_onnxruntime>=1.3.2 # install if need pdf +# optional document loaders +# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files +# jq # for .json and .jsonl files. 
suggest `conda install jq` on windows +# html2text # for .enex files +# beautifulsoup4 # for .mhtml files +# pysrt # for .srt files + requests pathlib pytest diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index 38565fa..41c3ad5 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -85,6 +85,7 @@ def list_files_from_folder(kb_name: str): LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'], + "MHTMLLoader": ['.mhtml'], "UnstructuredMarkdownLoader": ['.md'], "JSONLoader": [".json"], "JSONLinesLoader": [".jsonl"], @@ -106,6 +107,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'], "UnstructuredWordDocumentLoader": ['.docx', 'doc'], "UnstructuredXMLLoader": ['.xml'], "UnstructuredPowerPointLoader": ['.ppt', '.pptx'], + "EverNoteLoader": ['.enex'], "UnstructuredFileLoader": ['.txt'], } SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]