ocr 支持 GPU 加速(需要手动安装 rapidocr_paddle[gpu]);知识库支持 MHTML 和 Evernote 文件。 (#2265)
在 requirements 和 Wiki 中增加对可选文档加载器 SDK 的说明 ( close #2264 )
This commit is contained in:
parent
7d2de47bcf
commit
67b7c99d03
|
|
@ -1,13 +1,13 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from document_loaders.ocr import get_ocr
|
||||
|
||||
|
||||
class RapidOCRLoader(UnstructuredFileLoader):
|
||||
def _get_elements(self) -> List:
|
||||
def img2text(filepath):
|
||||
from rapidocr_onnxruntime import RapidOCR
|
||||
resp = ""
|
||||
ocr = RapidOCR()
|
||||
ocr = get_ocr()
|
||||
result, _ = ocr(filepath)
|
||||
if result:
|
||||
ocr_result = [line[1] for line in result]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from document_loaders.ocr import get_ocr
|
||||
import tqdm
|
||||
|
||||
|
||||
|
|
@ -7,9 +8,8 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
|||
def _get_elements(self) -> List:
|
||||
def pdf2text(filepath):
|
||||
import fitz # pyMuPDF里面的fitz包,不要与pip install fitz混淆
|
||||
from rapidocr_onnxruntime import RapidOCR
|
||||
import numpy as np
|
||||
ocr = RapidOCR()
|
||||
ocr = get_ocr()
|
||||
doc = fitz.open(filepath)
|
||||
resp = ""
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,18 @@
|
|||
from typing import TYPE_CHECKING
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
try:
|
||||
from rapidocr_paddle import RapidOCR
|
||||
except ImportError:
|
||||
from rapidocr_onnxruntime import RapidOCR
|
||||
|
||||
|
||||
def get_ocr(use_cuda: bool = True) -> "RapidOCR":
    """Create a RapidOCR engine, preferring the ``rapidocr_paddle`` backend.

    Args:
        use_cuda: Passed unchanged as ``det_use_cuda``, ``cls_use_cuda`` and
            ``rec_use_cuda`` to the ``rapidocr_paddle`` engine. It is not used
            when the ``rapidocr_onnxruntime`` fallback is taken.

    Returns:
        A ``RapidOCR`` instance from ``rapidocr_paddle`` when that path
        succeeds, otherwise one from ``rapidocr_onnxruntime`` built with its
        default arguments.

    Raises:
        ImportError: If the paddle path raises ``ImportError`` and
            ``rapidocr_onnxruntime`` cannot be imported either.
    """
    # The same flag drives the detection, classification and recognition stages.
    cuda_flags = dict.fromkeys(
        ("det_use_cuda", "cls_use_cuda", "rec_use_cuda"), use_cuda
    )
    try:
        from rapidocr_paddle import RapidOCR

        # Construction stays inside the try so that an ImportError raised
        # while building the paddle engine also triggers the fallback below.
        return RapidOCR(**cuda_flags)
    except ImportError:
        from rapidocr_onnxruntime import RapidOCR

        return RapidOCR()
|
||||
|
|
@ -1 +1 @@
|
|||
Subproject commit f789e5dde10f91136012f3470c020c8d34572436
|
||||
Subproject commit 9a3fa7a77f8748748b1c656fe8919ad5c4c63e3f
|
||||
|
|
@ -38,6 +38,12 @@ einops>=0.7.0
|
|||
transformers_stream_generator==0.0.4
|
||||
vllm==0.2.2; sys_platform == "linux"
|
||||
|
||||
# optional document loaders
|
||||
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
|
||||
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
|
||||
# html2text # for .enex files
|
||||
# beautifulsoup4 # for .mhtml files
|
||||
# pysrt # for .srt files
|
||||
|
||||
# Online api libs dependencies
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,13 @@ vllm==0.2.2; sys_platform == "linux"
|
|||
httpx[brotli,http2,socks]>=0.25.2
|
||||
|
||||
|
||||
# optional document loaders
|
||||
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
|
||||
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
|
||||
# html2text # for .enex files
|
||||
# beautifulsoup4 # for .mhtml files
|
||||
# pysrt # for .srt files
|
||||
|
||||
# Online api libs dependencies
|
||||
|
||||
# zhipuai>=1.0.7
|
||||
|
|
|
|||
|
|
@ -19,6 +19,13 @@ faiss-cpu
|
|||
# PyMuPDF==1.22.5 # install if need pdf
|
||||
# rapidocr_onnxruntime>=1.3.2 # install if need pdf
|
||||
|
||||
# optional document loaders
|
||||
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
|
||||
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
|
||||
# html2text # for .enex files
|
||||
# beautifulsoup4 # for .mhtml files
|
||||
# pysrt # for .srt files
|
||||
|
||||
requests
|
||||
pathlib
|
||||
pytest
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ def list_files_from_folder(kb_name: str):
|
|||
|
||||
|
||||
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
||||
"MHTMLLoader": ['.mhtml'],
|
||||
"UnstructuredMarkdownLoader": ['.md'],
|
||||
"JSONLoader": [".json"],
|
||||
"JSONLinesLoader": [".jsonl"],
|
||||
|
|
@ -106,6 +107,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
|||
"UnstructuredWordDocumentLoader": ['.docx', 'doc'],
|
||||
"UnstructuredXMLLoader": ['.xml'],
|
||||
"UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
|
||||
"EverNoteLoader": ['.enex'],
|
||||
"UnstructuredFileLoader": ['.txt'],
|
||||
}
|
||||
SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
|
||||
|
|
|
|||
Loading…
Reference in New Issue