Merge branch 'chatchat-space:dev' into dev
This commit is contained in:
commit
427646ff36
|
|
@ -0,0 +1,2 @@
|
||||||
|
from .mypdfloader import RapidOCRPDFLoader
|
||||||
|
from .myimgloader import RapidOCRLoader
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
from typing import List
|
||||||
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
|
class RapidOCRLoader(UnstructuredFileLoader):
    """Loader that runs RapidOCR on an image file and partitions the text."""

    def _get_elements(self) -> List:
        """OCR the image at ``self.file_path`` and partition the result.

        Returns:
            Whatever ``partition_text`` returns for the recognized text,
            called with ``self.unstructured_kwargs``.
        """

        def img2text(filepath):
            """Return the OCR output for *filepath* as newline-joined lines.

            An empty string is returned when the engine reports no result.
            """
            # Imported inside the function so that importing this module
            # does not itself require the OCR package.
            from rapidocr_onnxruntime import RapidOCR

            engine = RapidOCR()
            detections, _ = engine(filepath)
            if not detections:
                return ""
            # Index 1 of every entry is what gets joined into the output
            # (presumably the recognized string -- confirm against RapidOCR).
            return "\n".join([entry[1] for entry in detections])

        recognized = img2text(self.file_path)
        from unstructured.partition.text import partition_text

        return partition_text(text=recognized, **self.unstructured_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke run: load the sample image and print the documents.
    sample_loader = RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg")
    print(sample_loader.load())
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
from typing import List
|
||||||
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
|
class RapidOCRPDFLoader(UnstructuredFileLoader):
    """PDF loader combining each page's text layer with OCR of its images."""

    def _get_elements(self) -> List:
        """Extract text from the PDF at ``self.file_path`` and partition it.

        Returns:
            Whatever ``partition_text`` returns for the combined text,
            called with ``self.unstructured_kwargs``.
        """

        def pdf2text(filepath):
            """Return the text of every page followed by its images' OCR text.

            Every chunk (page text or one image's OCR lines) is terminated
            with a newline so consecutive chunks never run together.
            """
            # Imported inside the function so that importing this module
            # does not itself require the PDF/OCR packages.
            import fitz
            from rapidocr_onnxruntime import RapidOCR
            import numpy as np

            ocr = RapidOCR()
            chunks = []
            # The context manager closes the document even if OCR raises.
            with fitz.open(filepath) as doc:
                for page in doc:
                    # TODO: adjust the processing to respect the relative
                    # order of text and images on the page.
                    chunks.append(page.get_text(""))

                    for img in page.get_images():
                        pix = fitz.Pixmap(doc, img[0])
                        # Four or more colour components (e.g. CMYK) would
                        # be handed to the OCR engine as a 4-channel array;
                        # convert to RGB first.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        img_array = np.frombuffer(
                            pix.samples, dtype=np.uint8
                        ).reshape(pix.height, pix.width, -1)
                        result, _ = ocr(img_array)
                        if result:
                            chunks.append(
                                "\n".join(line[1] for line in result)
                            )
            return "".join(chunk + "\n" for chunk in chunks)

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text

        return partition_text(text=text, **self.unstructured_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke run: load the sample PDF and print the documents.
    sample_loader = RapidOCRPDFLoader(file_path="../tests/samples/ocr_test.pdf")
    print(sample_loader.load())
|
||||||
|
|
@ -15,6 +15,8 @@ SQLAlchemy==2.0.19
|
||||||
faiss-cpu
|
faiss-cpu
|
||||||
accelerate
|
accelerate
|
||||||
spacy
|
spacy
|
||||||
|
PyMuPDF==1.22.5
|
||||||
|
rapidocr_onnxruntime>=1.3.1
|
||||||
|
|
||||||
# uncomment libs if you want to use corresponding vector store
|
# uncomment libs if you want to use corresponding vector store
|
||||||
# pymilvus==2.1.3 # requires milvus==2.1.3
|
# pymilvus==2.1.3 # requires milvus==2.1.3
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ faiss-cpu
|
||||||
nltk
|
nltk
|
||||||
accelerate
|
accelerate
|
||||||
spacy
|
spacy
|
||||||
|
PyMuPDF==1.22.5
|
||||||
|
rapidocr_onnxruntime>=1.3.1
|
||||||
|
|
||||||
# uncomment libs if you want to use corresponding vector store
|
# uncomment libs if you want to use corresponding vector store
|
||||||
# pymilvus==2.1.3 # requires milvus==2.1.3
|
# pymilvus==2.1.3 # requires milvus==2.1.3
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,8 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
||||||
"UnstructuredMarkdownLoader": ['.md'],
|
"UnstructuredMarkdownLoader": ['.md'],
|
||||||
"CustomJSONLoader": [".json"],
|
"CustomJSONLoader": [".json"],
|
||||||
"CSVLoader": [".csv"],
|
"CSVLoader": [".csv"],
|
||||||
"PyPDFLoader": [".pdf"],
|
"RapidOCRPDFLoader": [".pdf"],
|
||||||
|
"RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
|
||||||
"UnstructuredFileLoader": ['.eml', '.msg', '.rst',
|
"UnstructuredFileLoader": ['.eml', '.msg', '.rst',
|
||||||
'.rtf', '.txt', '.xml',
|
'.rtf', '.txt', '.xml',
|
||||||
'.doc', '.docx', '.epub', '.odt',
|
'.doc', '.docx', '.epub', '.odt',
|
||||||
|
|
@ -196,6 +197,9 @@ class KnowledgeFile:
|
||||||
|
|
||||||
print(f"{self.document_loader_name} used for {self.filepath}")
|
print(f"{self.document_loader_name} used for {self.filepath}")
|
||||||
try:
|
try:
|
||||||
|
if self.document_loader_name in []:
|
||||||
|
document_loaders_module = importlib.import_module('document_loaders')
|
||||||
|
else:
|
||||||
document_loaders_module = importlib.import_module('langchain.document_loaders')
|
document_loaders_module = importlib.import_module('langchain.document_loaders')
|
||||||
DocumentLoader = getattr(document_loaders_module, self.document_loader_name)
|
DocumentLoader = getattr(document_loaders_module, self.document_loader_name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 7.9 KiB |
Binary file not shown.
Loading…
Reference in New Issue