update requirements.txt, requirements_api.txt, test_different_splitter.py and chinese_recursive_text_splitter.py

imClumsyPanda 2023-09-14 22:59:05 +08:00
parent 1813814a65
commit fbaca1009e
4 changed files with 32 additions and 10 deletions

requirements.txt
@@ -18,6 +18,12 @@ spacy
 PyMuPDF==1.22.5
 rapidocr_onnxruntime>=1.3.2
+requests
+pathlib
+pytest
+scikit-learn
+numexpr
 # uncomment libs if you want to use corresponding vector store
 # pymilvus==2.1.3 # requires milvus==2.1.3
 # psycopg2

requirements_api.txt
@@ -9,16 +9,20 @@ nltk~=3.8.1
 uvicorn~=0.23.1
 starlette~=0.27.0
 pydantic~=1.10.11
-unstructured[all-docs]
+unstructured[all-docs]>=0.10.4
 python-magic-bin; sys_platform == 'win32'
 SQLAlchemy==2.0.19
 faiss-cpu
-nltk
 accelerate
 spacy
 PyMuPDF==1.22.5
-rapidocr_onnxruntime>=1.3.1
+rapidocr_onnxruntime>=1.3.2
+websockets
+requests
+pathlib
+pytest
+scikit-learn
+numexpr
 # uncomment libs if you want to use corresponding vector store
 # pymilvus==2.1.3 # requires milvus==2.1.3
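Note (not part of the diff): the two pins raised here, unstructured[all-docs]>=0.10.4 and rapidocr_onnxruntime>=1.3.2, can be compared against a local environment with a small standard-library snippet. A minimal sketch, assuming Python 3.8+ so importlib.metadata is available; the names passed to version() are the PyPI distribution names used above.

from importlib.metadata import PackageNotFoundError, version

# Print installed versions of the packages whose minimum pins were raised in
# requirements_api.txt, so they can be checked against the new requirements.
for name in ("unstructured", "rapidocr_onnxruntime"):
    try:
        print(f"{name}: {version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")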

test_different_splitter.py
@@ -20,22 +20,34 @@ def text(splitter_name):
     docs = loader.load()
     text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE)
     if splitter_name == "MarkdownHeaderTextSplitter":
-        split_docs = text_splitter.split_text(docs[0].page_content)
+        docs = text_splitter.split_text(docs[0].page_content)
         for doc in docs:
             if doc.metadata:
                 doc.metadata["source"] = os.path.basename(filepath)
     else:
-        split_docs = text_splitter.split_documents(docs)
+        docs = text_splitter.split_documents(docs)
+    for doc in docs:
+        print(doc)
     return docs


 import pytest
-@pytest.mark.parametrize("splitter_name", ["ChineseRecursiveTextSplitter", "SpacyTextSplitter", "RecursiveCharacterTextSplitter","MarkdownHeaderTextSplitter"])
+from langchain.docstore.document import Document
+@pytest.mark.parametrize("splitter_name",
+                         [
+                             "ChineseRecursiveTextSplitter",
+                             "SpacyTextSplitter",
+                             "RecursiveCharacterTextSplitter",
+                             "MarkdownHeaderTextSplitter"
+                         ])
 def test_different_splitter(splitter_name):
     try:
         docs = text(splitter_name)
-        assert docs is not None
+        assert isinstance(docs, list)
+        if len(docs) > 0:
+            assert isinstance(docs[0], Document)
     except Exception as e:
         pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}")

chinese_recursive_text_splitter.py
@@ -83,7 +83,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
         if _good_splits:
             merged_text = self._merge_splits(_good_splits, _separator)
             final_chunks.extend(merged_text)
-        return final_chunks
+        return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]


 if __name__ == "__main__":
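Note (not part of the diff): the new return expression above strips each chunk, collapses runs of two or more newlines into a single newline, and drops chunks that are empty after stripping. A short standard-library illustration with made-up sample chunks:

import re

# Made-up chunks standing in for final_chunks produced by the splitter.
final_chunks = ["第一句。\n\n\n第二句。\n", "   ", "\n\n第三句。"]

cleaned = [re.sub(r"\n{2,}", "\n", chunk.strip())
           for chunk in final_chunks
           if chunk.strip() != ""]
print(cleaned)  # ['第一句。\n第二句。', '第三句。']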