使用paddleocr实现UnstructuredPaddlePDFLoader和UnstructuredPaddleImageLoader (#344)
* jpg and png ocr * fix * write docs to tmp file * fix * image loader * fix * fix * add pdf_loader * fix * update INSTALL.md --------- Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
This commit is contained in:
parent
ff8182f49a
commit
d2716addd6
|
|
@ -29,7 +29,14 @@ $ git clone https://github.com/imClumsyPanda/langchain-ChatGLM.git
|
||||||
# 进入目录
|
# 进入目录
|
||||||
$ cd langchain-ChatGLM
|
$ cd langchain-ChatGLM
|
||||||
|
|
||||||
|
# 使用paddleocr需要卸载detectron2避免tools冲突
|
||||||
|
$ pip uninstall detectron2
|
||||||
|
|
||||||
# 安装依赖
|
# 安装依赖
|
||||||
$ pip install -r requirements.txt
|
$ pip install -r requirements.txt
|
||||||
|
|
||||||
|
# 验证paddleocr是否安装成功,首次运行会下载约18M模型到~/.paddleocr
|
||||||
|
$ python test_image.py
|
||||||
|
|
||||||
```
|
```
|
||||||
注:使用 `langchain.document_loaders.UnstructuredFileLoader` 进行非结构化文件接入时,可能需要依据文档进行其他依赖包的安装,请参考 [langchain 文档](https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html)。
|
注:使用 `langchain.document_loaders.UnstructuredFileLoader` 进行非结构化文件接入时,可能需要依据文档进行其他依赖包的安装,请参考 [langchain 文档](https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html)。
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 7.9 KiB |
|
|
@ -27,8 +27,8 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
|
||||||
txt_file_path = image_ocr_txt(self.file_path)
|
txt_file_path = image_ocr_txt(self.file_path)
|
||||||
from unstructured.partition.text import partition_text
|
from unstructured.partition.text import partition_text
|
||||||
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
filepath = "../content/samples/test.jpg"
|
filepath = "../content/samples/test.jpg"
|
||||||
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
|
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
|
||||||
|
|
|
||||||
|
|
@ -44,9 +44,10 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
|
||||||
from unstructured.partition.text import partition_text
|
from unstructured.partition.text import partition_text
|
||||||
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
filepath = "../content/samples/test.pdf"
|
filepath = "../content/samples/test.pdf"
|
||||||
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
|
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
print(doc)
|
print(doc)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
from configs.model_config import *
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
|
||||||
|
|
||||||
|
filepath = "./img/test.jpg"
|
||||||
|
from loader import UnstructuredPaddleImageLoader
|
||||||
|
|
||||||
|
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
|
||||||
|
docs = loader.load()
|
||||||
|
for doc in docs:
|
||||||
|
print(doc)
|
||||||
|
|
@ -0,0 +1,12 @@
|
||||||
|
from configs.model_config import *
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
|
||||||
|
|
||||||
|
filepath = "docs/test.pdf"
|
||||||
|
from loader import UnstructuredPaddlePDFLoader
|
||||||
|
|
||||||
|
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
|
||||||
|
docs = loader.load()
|
||||||
|
for doc in docs:
|
||||||
|
print(doc)
|
||||||
Loading…
Reference in New Issue