ocr 支持 GPU 加速(需要手动安装 rapidocr_paddle[gpu]);知识库支持 MHTML 和 Evernote 文件。 (#2265)

在 requirements 和 Wiki 中增加对可选文档加载器 SDK 的说明 ( close #2264 )
This commit is contained in:
liunux4odoo 2023-12-04 09:39:56 +08:00 committed by GitHub
parent 7d2de47bcf
commit 67b7c99d03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 45 additions and 5 deletions

View File

@ -1,13 +1,13 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from document_loaders.ocr import get_ocr
class RapidOCRLoader(UnstructuredFileLoader):
def _get_elements(self) -> List:
def img2text(filepath):
from rapidocr_onnxruntime import RapidOCR
resp = ""
ocr = RapidOCR()
ocr = get_ocr()
result, _ = ocr(filepath)
if result:
ocr_result = [line[1] for line in result]

View File

@ -1,5 +1,6 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from document_loaders.ocr import get_ocr
import tqdm
@ -7,9 +8,8 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
def _get_elements(self) -> List:
def pdf2text(filepath):
import fitz # pyMuPDF里面的fitz包不要与pip install fitz混淆
from rapidocr_onnxruntime import RapidOCR
import numpy as np
ocr = RapidOCR()
ocr = get_ocr()
doc = fitz.open(filepath)
resp = ""

18
document_loaders/ocr.py Normal file
View File

@ -0,0 +1,18 @@
from typing import TYPE_CHECKING
if TYPE_CHECKING:
try:
from rapidocr_paddle import RapidOCR
except ImportError:
from rapidocr_onnxruntime import RapidOCR
def get_ocr(use_cuda: bool = True) -> "RapidOCR":
    """Build and return a RapidOCR engine, preferring the Paddle backend.

    The ``rapidocr_paddle`` package is tried first; if importing or
    constructing it raises ``ImportError``, the function falls back to
    ``rapidocr_onnxruntime`` created with its default arguments.

    Args:
        use_cuda: Passed to the Paddle engine as ``det_use_cuda``,
            ``cls_use_cuda`` and ``rec_use_cuda``. It is not used by the
            onnxruntime fallback.

    Returns:
        A ``RapidOCR`` instance from whichever backend could be imported.

    Raises:
        ImportError: If the fallback ``rapidocr_onnxruntime`` import fails too.
    """
    try:
        from rapidocr_paddle import RapidOCR as PaddleOCR

        # One switch drives all three Paddle stages (det / cls / rec).
        cuda_flags = dict.fromkeys(
            ("det_use_cuda", "cls_use_cuda", "rec_use_cuda"), use_cuda
        )
        # Construction stays inside the try so an ImportError raised while
        # building the engine also triggers the fallback below.
        return PaddleOCR(**cuda_flags)
    except ImportError:
        from rapidocr_onnxruntime import RapidOCR as OnnxOCR

        return OnnxOCR()

@ -1 +1 @@
Subproject commit f789e5dde10f91136012f3470c020c8d34572436
Subproject commit 9a3fa7a77f8748748b1c656fe8919ad5c4c63e3f

View File

@ -38,6 +38,12 @@ einops>=0.7.0
transformers_stream_generator==0.0.4
vllm==0.2.2; sys_platform == "linux"
# optional document loaders
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
# html2text # for .enex files
# beautifulsoup4 # for .mhtml files
# pysrt # for .srt files
# Online api libs dependencies

View File

@ -40,6 +40,13 @@ vllm==0.2.2; sys_platform == "linux"
httpx[brotli,http2,socks]>=0.25.2
# optional document loaders
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
# html2text # for .enex files
# beautifulsoup4 # for .mhtml files
# pysrt # for .srt files
# Online api libs dependencies
# zhipuai>=1.0.7

View File

@ -19,6 +19,13 @@ faiss-cpu
# PyMuPDF==1.22.5 # install if need pdf
# rapidocr_onnxruntime>=1.3.2 # install if need pdf
# optional document loaders
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
# html2text # for .enex files
# beautifulsoup4 # for .mhtml files
# pysrt # for .srt files
requests
pathlib
pytest

View File

@ -85,6 +85,7 @@ def list_files_from_folder(kb_name: str):
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
"MHTMLLoader": ['.mhtml'],
"UnstructuredMarkdownLoader": ['.md'],
"JSONLoader": [".json"],
"JSONLinesLoader": [".jsonl"],
@ -106,6 +107,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
"UnstructuredWordDocumentLoader": ['.docx', 'doc'],
"UnstructuredXMLLoader": ['.xml'],
"UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
"EverNoteLoader": ['.enex'],
"UnstructuredFileLoader": ['.txt'],
}
SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]