使用 paddleocr 实现 UnstructuredPaddlePDFLoader 和 UnstructuredPaddleImageLoader (#344)
* jpg and png ocr
* fix
* write docs to tmp file
* fix
* image loader
* fix
* fix
* add pdf_loader
* fix
* update INSTALL.md

---------

Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
This commit is contained in:
parent
ff8182f49a
commit
d2716addd6
|
|
@@ -29,7 +29,14 @@ $ git clone https://github.com/imClumsyPanda/langchain-ChatGLM.git
|
|||
# 进入目录
|
||||
$ cd langchain-ChatGLM
|
||||
|
||||
# 使用paddleocr需要卸载detectron2避免tools冲突
|
||||
$ pip uninstall detectron2
|
||||
|
||||
# 安装依赖
|
||||
$ pip install -r requirements.txt
|
||||
|
||||
# 验证 paddleocr 是否安装成功,首次运行会下载约 18M 模型到 ~/.paddleocr
|
||||
$ python test_image.py
|
||||
|
||||
```
|
||||
注:使用 `langchain.document_loaders.UnstructuredFileLoader` 进行非结构化文件接入时,可能需要依据文档进行其他依赖包的安装,请参考 [langchain 文档](https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html)。
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 7.9 KiB |
|
|
@@ -27,8 +27,8 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
|
|||
txt_file_path = image_ocr_txt(self.file_path)
|
||||
from unstructured.partition.text import partition_text
|
||||
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
filepath = "../content/samples/test.jpg"
|
||||
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
|
||||
|
|
|
|||
|
|
@@ -44,9 +44,10 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
|
|||
from unstructured.partition.text import partition_text
|
||||
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
filepath = "../content/samples/test.pdf"
|
||||
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
|
||||
docs = loader.load()
|
||||
for doc in docs:
|
||||
print(doc)
|
||||
print(doc)
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,12 @@
|
|||
from configs.model_config import *
|
||||
import nltk
|
||||
|
||||
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
|
||||
|
||||
filepath = "./img/test.jpg"
|
||||
from loader import UnstructuredPaddleImageLoader
|
||||
|
||||
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
|
||||
docs = loader.load()
|
||||
for doc in docs:
|
||||
print(doc)
|
||||
|
|
@@ -0,0 +1,12 @@
|
|||
from configs.model_config import *
|
||||
import nltk
|
||||
|
||||
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
|
||||
|
||||
filepath = "docs/test.pdf"
|
||||
from loader import UnstructuredPaddlePDFLoader
|
||||
|
||||
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
|
||||
docs = loader.load()
|
||||
for doc in docs:
|
||||
print(doc)
|
||||
Loading…
Reference in New Issue