使用 paddleocr 实现 UnstructuredPaddlePDFLoader 和 UnstructuredPaddleImageLoader (#344)

* jpg and png ocr
* fix
* write docs to tmp file
* fix
* image loader
* fix
* fix
* add pdf_loader
* fix
* update INSTALL.md

---------

Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
2023-05-13 11:13:40 +08:00 · 2023-05-13 11:13:40 +08:00 · d2716addd6
parent ff8182f49a
commit d2716addd6
7 changed files with 35 additions and 3 deletions
--- a/docs/INSTALL.md
+++ b/docs/INSTALL.md
@@ -29,7 +29,14 @@ $ git clone https://github.com/imClumsyPanda/langchain-ChatGLM.git
 # 进入目录
 $ cd langchain-ChatGLM

+# 使用paddleocr需要卸载detectron2避免tools冲突
+$ pip uninstall detectron2
+
 # 安装依赖
 $ pip install -r requirements.txt
+
+# 验证paddleocr是否成功，首次运行会下载约18M模型到~/.paddleocr
+$ python test_image.py
+
 ```
 注：使用 `langchain.document_loaders.UnstructuredFileLoader` 进行非结构化文件接入时，可能需要依据文档进行其他依赖包的安装，请参考 [langchain 文档](https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html)。
--- a/docs/test.pdf
+++ b/docs/test.pdf
--- a/img/test.jpg
+++ b/img/test.jpg
--- a/loader/image_loader.py
+++ b/loader/image_loader.py
@@ -27,8 +27,8 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
        txt_file_path = image_ocr_txt(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
-
-
+      
+      
 if __name__ == "__main__":
    filepath = "../content/samples/test.jpg"
    loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
--- a/loader/pdf_loader.py
+++ b/loader/pdf_loader.py
@@ -44,9 +44,10 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)

+
 if __name__ == "__main__":
    filepath = "../content/samples/test.pdf"
    loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
    docs = loader.load()
    for doc in docs:
-        print(doc)
+        print(doc)
--- a/test_image.py
+++ b/test_image.py
@@ -0,0 +1,12 @@
+from configs.model_config import *
+import nltk
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+filepath = "./img/test.jpg"
+from loader import UnstructuredPaddleImageLoader
+
+loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
+docs = loader.load()
+for doc in docs:
+    print(doc)
--- a/test_pdf.py
+++ b/test_pdf.py
@@ -0,0 +1,12 @@
+from configs.model_config import *
+import nltk
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+filepath = "docs/test.pdf"
+from loader import UnstructuredPaddlePDFLoader
+
+loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
+docs = loader.load()
+for doc in docs:
+    print(doc)