update requirements.txt, requirements_api.txt, test_different_splitter.py and chinese_recursive_text_splitter.py

This commit is contained in:
imClumsyPanda 2023-09-14 22:59:05 +08:00
parent 1813814a65
commit fbaca1009e
4 changed files with 32 additions and 10 deletions

View File

@ -18,6 +18,12 @@ spacy
PyMuPDF==1.22.5
rapidocr_onnxruntime>=1.3.2
requests
pathlib
pytest
scikit-learn
numexpr
# uncomment libs if you want to use corresponding vector store
# pymilvus==2.1.3 # requires milvus==2.1.3
# psycopg2

View File

@ -9,18 +9,22 @@ nltk~=3.8.1
uvicorn~=0.23.1
starlette~=0.27.0
pydantic~=1.10.11
unstructured[all-docs]
unstructured[all-docs]>=0.10.4
python-magic-bin; sys_platform == 'win32'
SQLAlchemy==2.0.19
faiss-cpu
nltk
accelerate
spacy
PyMuPDF==1.22.5
rapidocr_onnxruntime>=1.3.1
websockets
rapidocr_onnxruntime>=1.3.2
requests
pathlib
pytest
scikit-learn
numexpr
# uncomment libs if you want to use corresponding vector store
# pymilvus==2.1.3 # requires milvus==2.1.3
# psycopg2
# pgvector
# pgvector

View File

@ -20,22 +20,34 @@ def text(splitter_name):
docs = loader.load()
text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE)
if splitter_name == "MarkdownHeaderTextSplitter":
split_docs = text_splitter.split_text(docs[0].page_content)
docs = text_splitter.split_text(docs[0].page_content)
for doc in docs:
if doc.metadata:
doc.metadata["source"] = os.path.basename(filepath)
else:
split_docs = text_splitter.split_documents(docs)
docs = text_splitter.split_documents(docs)
for doc in docs:
print(doc)
return docs
import pytest
@pytest.mark.parametrize("splitter_name", ["ChineseRecursiveTextSplitter", "SpacyTextSplitter", "RecursiveCharacterTextSplitter","MarkdownHeaderTextSplitter"])
from langchain.docstore.document import Document
@pytest.mark.parametrize(
    "splitter_name",
    [
        "ChineseRecursiveTextSplitter",
        "SpacyTextSplitter",
        "RecursiveCharacterTextSplitter",
        "MarkdownHeaderTextSplitter",
    ],
)
def test_different_splitter(splitter_name):
    """Check that the named splitter produces a list of ``Document`` objects.

    The test runs ``text(splitter_name)`` once per parametrized splitter
    name and verifies the shape of what comes back.

    Args:
        splitter_name: Name of the text splitter to exercise; supplied by
            the ``parametrize`` decorator above.
    """
    # Guard only the call that does the loading/splitting work, so that an
    # unexpected error is reported together with the splitter that caused it.
    # The assertions stay outside the ``try`` so a failed check is reported
    # by pytest as a normal assertion failure instead of being caught and
    # re-raised as a flattened string.
    try:
        docs = text(splitter_name)
    except Exception as e:
        pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {e}")

    assert docs is not None
    assert isinstance(docs, list)
    # An empty result is accepted; the element type is only checked when
    # there is at least one chunk to inspect.
    if docs:
        assert isinstance(docs[0], Document)

View File

@ -83,7 +83,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
if _good_splits:
merged_text = self._merge_splits(_good_splits, _separator)
final_chunks.extend(merged_text)
return final_chunks
return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
if __name__ == "__main__":