Merge branch 'chatchat-space:dev' into dev

2023-09-01 13:57:23 +08:00 · 2023-09-01 13:57:23 +08:00 · 427646ff36
parent c54ee8c289 6c4ef26e9a
commit 427646ff36
8 changed files with 74 additions and 2 deletions
--- a/document_loaders/__init__.py
+++ b/document_loaders/__init__.py
@ -0,0 +1,2 @@
 from .mypdfloader import RapidOCRPDFLoader
 from .myimgloader import RapidOCRLoader
--- a/document_loaders/myimgloader.py
+++ b/document_loaders/myimgloader.py
@ -0,0 +1,25 @@
 from typing import List
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 class RapidOCRLoader(UnstructuredFileLoader):
    """Loader that runs RapidOCR on an image file and partitions the recognised text."""

    def _get_elements(self) -> List:
        """Return the ``partition_text`` elements for the text OCR'd from ``self.file_path``.

        ``self.unstructured_kwargs`` is forwarded unchanged to ``partition_text``.
        """
        def img2text(filepath):
            """OCR ``filepath`` and return the recognised lines joined by newlines.

            Returns an empty string when the OCR engine reports no result.
            """
            # Imported lazily so the OCR dependency is only needed when an image is loaded.
            from rapidocr_onnxruntime import RapidOCR
            engine = RapidOCR()
            detections, _ = engine(filepath)
            if not detections:
                return ""
            # The second field of every result entry is kept (presumably the recognised text).
            return "\n".join(entry[1] for entry in detections)

        recognised = img2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=recognised, **self.unstructured_kwargs)
 if __name__ == "__main__":
    # Manual smoke test: OCR the bundled sample image and print the loaded documents.
    sample_loader = RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg")
    print(sample_loader.load())
--- a/document_loaders/mypdfloader.py
+++ b/document_loaders/mypdfloader.py
@ -0,0 +1,37 @@
 from typing import List
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 class RapidOCRPDFLoader(UnstructuredFileLoader):
    """Loader that extracts a PDF's text layer and additionally OCRs its embedded images."""

    def _get_elements(self) -> List:
        """Return the ``partition_text`` elements for the text gathered from ``self.file_path``.

        ``self.unstructured_kwargs`` is forwarded unchanged to ``partition_text``.
        """
        def pdf2text(filepath):
            """Collect page text and OCR'd image text from the PDF at ``filepath``.

            For every page the text layer is emitted first, followed by the OCR
            output of each image on that page. Every piece is terminated by a
            newline so that the last line of one piece is never glued to the
            first line of the next one.
            """
            # Imported lazily so the heavy dependencies are only needed for PDFs.
            import fitz
            from rapidocr_onnxruntime import RapidOCR
            import numpy as np

            ocr = RapidOCR()
            pieces = []
            # The context manager closes the document even if OCR raises.
            with fitz.open(filepath) as doc:
                for page in doc:
                    # TODO: adjust the processing to respect the order of text and images
                    pieces.append(page.get_text(""))
                    for img in page.get_images():
                        # img[0] is passed as the image reference understood by fitz.Pixmap.
                        pix = fitz.Pixmap(doc, img[0])
                        # PyMuPDF's image-extraction recipe converts pixmaps with more
                        # than three colour components (e.g. CMYK) to RGB first.
                        # NOTE(review): confirm how RapidOCR treats 4-channel arrays.
                        if pix.n - pix.alpha > 3:
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        # -1 lets numpy infer the channel count from the buffer size.
                        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                            pix.height, pix.width, -1)
                        result, _ = ocr(img_array)
                        if result:
                            # Second field of each entry is kept (presumably the text).
                            pieces.append("\n".join(line[1] for line in result))
            return "".join(piece + "\n" for piece in pieces)

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)
 if __name__ == "__main__":
    # Manual smoke test: load the bundled sample PDF and print the loaded documents.
    sample_loader = RapidOCRPDFLoader(file_path="../tests/samples/ocr_test.pdf")
    print(sample_loader.load())
--- a/requirements.txt
+++ b/requirements.txt
@ -15,6 +15,8 @@ SQLAlchemy==2.0.19
 faiss-cpu
 accelerate
 spacy
 PyMuPDF==1.22.5
 rapidocr_onnxruntime>=1.3.1
 # uncomment libs if you want to use corresponding vector store
 # pymilvus==2.1.3 # requires milvus==2.1.3
--- a/requirements_api.txt
+++ b/requirements_api.txt
@ -16,6 +16,8 @@ faiss-cpu
 nltk
 accelerate
 spacy
 PyMuPDF==1.22.5
 rapidocr_onnxruntime>=1.3.1
 # uncomment libs if you want to use corresponding vector store
 # pymilvus==2.1.3 # requires milvus==2.1.3
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@ -87,7 +87,8 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
               "UnstructuredMarkdownLoader": ['.md'],
               "CustomJSONLoader": [".json"],
               "CSVLoader": [".csv"],
-               "PyPDFLoader": [".pdf"],
+               "RapidOCRPDFLoader": [".pdf"],
               "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
               "UnstructuredFileLoader": ['.eml', '.msg', '.rst',
                                          '.rtf', '.txt', '.xml',
                                          '.doc', '.docx', '.epub', '.odt',
@ -196,6 +197,9 @@ class KnowledgeFile:
        print(f"{self.document_loader_name} used for {self.filepath}")
        try:
            # The OCR loaders are project-local and live in the ``document_loaders``
            # package; every other loader name is resolved from langchain. Testing
            # against an empty list made this branch unreachable, so the OCR loaders
            # were looked up in langchain, which does not define them.
            if self.document_loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader"]:
                document_loaders_module = importlib.import_module('document_loaders')
            else:
                document_loaders_module = importlib.import_module('langchain.document_loaders')
            DocumentLoader = getattr(document_loaders_module, self.document_loader_name)
        except Exception as e:
--- a/tests/samples/ocr_test.jpg
+++ b/tests/samples/ocr_test.jpg
--- a/tests/samples/ocr_test.pdf
+++ b/tests/samples/ocr_test.pdf
		`@ -0,0 +1,2 @@`
							`from .mypdfloader import RapidOCRPDFLoader`
							`from .myimgloader import RapidOCRLoader`