diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 88739c9..5cf8433 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -29,7 +29,14 @@ $ git clone https://github.com/imClumsyPanda/langchain-ChatGLM.git # 进入目录 $ cd langchain-ChatGLM +# 使用paddleocr需要卸载detectron2避免tools冲突 +$ pip uninstall detectron2 + # 安装依赖 $ pip install -r requirements.txt + +# 验证paddleocr是否成功,首次运行会下载约18M模型到~/.paddleocr +$ python test_image.py + ``` 注:使用 `langchain.document_loaders.UnstructuredFileLoader` 进行非结构化文件接入时,可能需要依据文档进行其他依赖包的安装,请参考 [langchain 文档](https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/unstructured_file.html)。 diff --git a/docs/test.pdf b/docs/test.pdf new file mode 100644 index 0000000..3a137ad Binary files /dev/null and b/docs/test.pdf differ diff --git a/img/test.jpg b/img/test.jpg new file mode 100644 index 0000000..70c199b Binary files /dev/null and b/img/test.jpg differ diff --git a/loader/image_loader.py b/loader/image_loader.py index 9c215d1..d8e2047 100644 --- a/loader/image_loader.py +++ b/loader/image_loader.py @@ -27,8 +27,8 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader): txt_file_path = image_ocr_txt(self.file_path) from unstructured.partition.text import partition_text return partition_text(filename=txt_file_path, **self.unstructured_kwargs) - - + + if __name__ == "__main__": filepath = "../content/samples/test.jpg" loader = UnstructuredPaddleImageLoader(filepath, mode="elements") diff --git a/loader/pdf_loader.py b/loader/pdf_loader.py index 03666a7..ff886d6 100644 --- a/loader/pdf_loader.py +++ b/loader/pdf_loader.py @@ -44,9 +44,10 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): from unstructured.partition.text import partition_text return partition_text(filename=txt_file_path, **self.unstructured_kwargs) + if __name__ == "__main__": filepath = "../content/samples/test.pdf" loader = UnstructuredPaddlePDFLoader(filepath, mode="elements") docs = loader.load() for doc in docs: - print(doc) \ No newline at end of file + print(doc) diff --git a/test_image.py b/test_image.py new file mode 100644 index 0000000..ed60890 --- /dev/null +++ b/test_image.py @@ -0,0 +1,12 @@ +from configs.model_config import * +import nltk + +nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path + +filepath = "./img/test.jpg" +from loader import UnstructuredPaddleImageLoader + +loader = UnstructuredPaddleImageLoader(filepath, mode="elements") +docs = loader.load() +for doc in docs: + print(doc) diff --git a/test_pdf.py b/test_pdf.py new file mode 100644 index 0000000..32dcb34 --- /dev/null +++ b/test_pdf.py @@ -0,0 +1,12 @@ +from configs.model_config import * +import nltk + +nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path + +filepath = "docs/test.pdf" +from loader import UnstructuredPaddlePDFLoader + +loader = UnstructuredPaddlePDFLoader(filepath, mode="elements") +docs = loader.load() +for doc in docs: + print(doc)