From fbaca1009e44887c92aa266f2dc00c1a97c9ba37 Mon Sep 17 00:00:00 2001 From: imClumsyPanda Date: Thu, 14 Sep 2023 22:59:05 +0800 Subject: [PATCH] update requirements.txt, requirements_api.txt, test_different_splitter.py and chinese_recursive_text_splitter.py --- requirements.txt | 6 ++++++ requirements_api.txt | 14 ++++++++----- .../test_different_splitter.py | 20 +++++++++++++++---- .../chinese_recursive_text_splitter.py | 2 +- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index abd0f1b..5714788 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,12 @@ spacy PyMuPDF==1.22.5 rapidocr_onnxruntime>=1.3.2 +requests +pathlib +pytest +scikit-learn +numexpr + # uncomment libs if you want to use corresponding vector store # pymilvus==2.1.3 # requires milvus==2.1.3 # psycopg2 diff --git a/requirements_api.txt b/requirements_api.txt index 9a05708..c56c07b 100644 --- a/requirements_api.txt +++ b/requirements_api.txt @@ -9,18 +9,22 @@ nltk~=3.8.1 uvicorn~=0.23.1 starlette~=0.27.0 pydantic~=1.10.11 -unstructured[all-docs] +unstructured[all-docs]>=0.10.4 python-magic-bin; sys_platform == 'win32' SQLAlchemy==2.0.19 faiss-cpu -nltk accelerate spacy PyMuPDF==1.22.5 -rapidocr_onnxruntime>=1.3.1 -websockets +rapidocr_onnxruntime>=1.3.2 + +requests +pathlib +pytest +scikit-learn +numexpr # uncomment libs if you want to use corresponding vector store # pymilvus==2.1.3 # requires milvus==2.1.3 # psycopg2 -# pgvector +# pgvector \ No newline at end of file diff --git a/tests/custom_splitter/test_different_splitter.py b/tests/custom_splitter/test_different_splitter.py index 9993a9c..fea597e 100644 --- a/tests/custom_splitter/test_different_splitter.py +++ b/tests/custom_splitter/test_different_splitter.py @@ -20,22 +20,34 @@ def text(splitter_name): docs = loader.load() text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE) if splitter_name == "MarkdownHeaderTextSplitter": - split_docs = text_splitter.split_text(docs[0].page_content) + docs = text_splitter.split_text(docs[0].page_content) for doc in docs: if doc.metadata: doc.metadata["source"] = os.path.basename(filepath) else: - split_docs = text_splitter.split_documents(docs) + docs = text_splitter.split_documents(docs) + for doc in docs: + print(doc) return docs import pytest -@pytest.mark.parametrize("splitter_name", ["ChineseRecursiveTextSplitter", "SpacyTextSplitter", "RecursiveCharacterTextSplitter","MarkdownHeaderTextSplitter"]) +from langchain.docstore.document import Document + +@pytest.mark.parametrize("splitter_name", + [ + "ChineseRecursiveTextSplitter", + "SpacyTextSplitter", + "RecursiveCharacterTextSplitter", + "MarkdownHeaderTextSplitter" + ]) def test_different_splitter(splitter_name): try: docs = text(splitter_name) - assert docs is not None + assert isinstance(docs, list) + if len(docs)>0: + assert isinstance(docs[0], Document) except Exception as e: pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}") diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index 05b57b1..70b4b29 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -83,7 +83,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): if _good_splits: merged_text = self._merge_splits(_good_splits, _separator) final_chunks.extend(merged_text) - return final_chunks + return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""] if __name__ == "__main__":