ocr 支持 GPU 加速(需要手动安装 rapidocr_paddle[gpu]);知识库支持 MHTML 和 Evernote 文件。 (#2265)
在 requirements 和 Wiki 中增加对可选文档加载器 SDK 的说明 ( close #2264 )
This commit is contained in:
parent
7d2de47bcf
commit
67b7c99d03
|
|
@ -1,13 +1,13 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
from document_loaders.ocr import get_ocr
|
||||||
|
|
||||||
|
|
||||||
class RapidOCRLoader(UnstructuredFileLoader):
|
class RapidOCRLoader(UnstructuredFileLoader):
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
def img2text(filepath):
|
def img2text(filepath):
|
||||||
from rapidocr_onnxruntime import RapidOCR
|
|
||||||
resp = ""
|
resp = ""
|
||||||
ocr = RapidOCR()
|
ocr = get_ocr()
|
||||||
result, _ = ocr(filepath)
|
result, _ = ocr(filepath)
|
||||||
if result:
|
if result:
|
||||||
ocr_result = [line[1] for line in result]
|
ocr_result = [line[1] for line in result]
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
from document_loaders.ocr import get_ocr
|
||||||
import tqdm
|
import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -7,9 +8,8 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
def pdf2text(filepath):
|
def pdf2text(filepath):
|
||||||
import fitz # pyMuPDF里面的fitz包,不要与pip install fitz混淆
|
import fitz # pyMuPDF里面的fitz包,不要与pip install fitz混淆
|
||||||
from rapidocr_onnxruntime import RapidOCR
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
ocr = RapidOCR()
|
ocr = get_ocr()
|
||||||
doc = fitz.open(filepath)
|
doc = fitz.open(filepath)
|
||||||
resp = ""
|
resp = ""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
try:
|
||||||
|
from rapidocr_paddle import RapidOCR
|
||||||
|
except ImportError:
|
||||||
|
from rapidocr_onnxruntime import RapidOCR
|
||||||
|
|
||||||
|
|
||||||
|
def get_ocr(use_cuda: bool = True) -> "RapidOCR":
    """Create an OCR engine, preferring the paddle backend over onnxruntime.

    The function first tries to import ``RapidOCR`` from ``rapidocr_paddle``
    and builds it with ``use_cuda`` forwarded to the ``det_use_cuda``,
    ``cls_use_cuda`` and ``rec_use_cuda`` keyword arguments. If an
    ``ImportError`` is raised anywhere in that attempt, it falls back to
    ``RapidOCR`` from ``rapidocr_onnxruntime`` constructed with no arguments.

    Args:
        use_cuda: Flag passed to the three ``*_use_cuda`` options of the
            paddle backend. It is not used on the onnxruntime fallback path.

    Returns:
        The constructed ``RapidOCR`` instance from whichever backend loaded.
    """
    try:
        from rapidocr_paddle import RapidOCR as PaddleRapidOCR

        # The same flag drives detection, classification and recognition.
        cuda_options = {
            "det_use_cuda": use_cuda,
            "cls_use_cuda": use_cuda,
            "rec_use_cuda": use_cuda,
        }
        return PaddleRapidOCR(**cuda_options)
    except ImportError:
        # Paddle backend not importable: use the onnxruntime build instead.
        from rapidocr_onnxruntime import RapidOCR as OnnxRapidOCR

        return OnnxRapidOCR()
|
||||||
|
|
@ -1 +1 @@
|
||||||
Subproject commit f789e5dde10f91136012f3470c020c8d34572436
|
Subproject commit 9a3fa7a77f8748748b1c656fe8919ad5c4c63e3f
|
||||||
|
|
@ -38,6 +38,12 @@ einops>=0.7.0
|
||||||
transformers_stream_generator==0.0.4
|
transformers_stream_generator==0.0.4
|
||||||
vllm==0.2.2; sys_platform == "linux"
|
vllm==0.2.2; sys_platform == "linux"
|
||||||
|
|
||||||
|
# optional document loaders
|
||||||
|
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
|
||||||
|
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
|
||||||
|
# html2text # for .enex files
|
||||||
|
# beautifulsoup4 # for .mhtml files
|
||||||
|
# pysrt # for .srt files
|
||||||
|
|
||||||
# Online api libs dependencies
|
# Online api libs dependencies
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,13 @@ vllm==0.2.2; sys_platform == "linux"
|
||||||
httpx[brotli,http2,socks]>=0.25.2
|
httpx[brotli,http2,socks]>=0.25.2
|
||||||
|
|
||||||
|
|
||||||
|
# optional document loaders
|
||||||
|
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
|
||||||
|
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
|
||||||
|
# html2text # for .enex files
|
||||||
|
# beautifulsoup4 # for .mhtml files
|
||||||
|
# pysrt # for .srt files
|
||||||
|
|
||||||
# Online api libs dependencies
|
# Online api libs dependencies
|
||||||
|
|
||||||
# zhipuai>=1.0.7
|
# zhipuai>=1.0.7
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,13 @@ faiss-cpu
|
||||||
# PyMuPDF==1.22.5 # install if need pdf
|
# PyMuPDF==1.22.5 # install if need pdf
|
||||||
# rapidocr_onnxruntime>=1.3.2 # install if need pdf
|
# rapidocr_onnxruntime>=1.3.2 # install if need pdf
|
||||||
|
|
||||||
|
# optional document loaders
|
||||||
|
# rapidocr_paddle[gpu] # gpu acceleration for ocr of pdf and image files
|
||||||
|
# jq # for .json and .jsonl files. suggest `conda install jq` on windows
|
||||||
|
# html2text # for .enex files
|
||||||
|
# beautifulsoup4 # for .mhtml files
|
||||||
|
# pysrt # for .srt files
|
||||||
|
|
||||||
requests
|
requests
|
||||||
pathlib
|
pathlib
|
||||||
pytest
|
pytest
|
||||||
|
|
|
||||||
|
|
@ -85,6 +85,7 @@ def list_files_from_folder(kb_name: str):
|
||||||
|
|
||||||
|
|
||||||
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
||||||
|
"MHTMLLoader": ['.mhtml'],
|
||||||
"UnstructuredMarkdownLoader": ['.md'],
|
"UnstructuredMarkdownLoader": ['.md'],
|
||||||
"JSONLoader": [".json"],
|
"JSONLoader": [".json"],
|
||||||
"JSONLinesLoader": [".jsonl"],
|
"JSONLinesLoader": [".jsonl"],
|
||||||
|
|
@ -106,6 +107,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
||||||
"UnstructuredWordDocumentLoader": ['.docx', 'doc'],
|
"UnstructuredWordDocumentLoader": ['.docx', 'doc'],
|
||||||
"UnstructuredXMLLoader": ['.xml'],
|
"UnstructuredXMLLoader": ['.xml'],
|
||||||
"UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
|
"UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
|
||||||
|
"EverNoteLoader": ['.enex'],
|
||||||
"UnstructuredFileLoader": ['.txt'],
|
"UnstructuredFileLoader": ['.txt'],
|
||||||
}
|
}
|
||||||
SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
|
SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue