update requirements.txt, requirements_api.txt, test_different_splitter.py and chinese_recursive_text_splitter.py
This commit is contained in:
parent
1813814a65
commit
fbaca1009e
|
|
@@ -18,6 +18,12 @@ spacy
|
||||||
PyMuPDF==1.22.5
|
PyMuPDF==1.22.5
|
||||||
rapidocr_onnxruntime>=1.3.2
|
rapidocr_onnxruntime>=1.3.2
|
||||||
|
|
||||||
|
requests
|
||||||
|
pathlib
|
||||||
|
pytest
|
||||||
|
scikit-learn
|
||||||
|
numexpr
|
||||||
|
|
||||||
# uncomment libs if you want to use corresponding vector store
|
# uncomment libs if you want to use corresponding vector store
|
||||||
# pymilvus==2.1.3 # requires milvus==2.1.3
|
# pymilvus==2.1.3 # requires milvus==2.1.3
|
||||||
# psycopg2
|
# psycopg2
|
||||||
|
|
|
||||||
|
|
@@ -9,16 +9,20 @@ nltk~=3.8.1
|
||||||
uvicorn~=0.23.1
|
uvicorn~=0.23.1
|
||||||
starlette~=0.27.0
|
starlette~=0.27.0
|
||||||
pydantic~=1.10.11
|
pydantic~=1.10.11
|
||||||
unstructured[all-docs]
|
unstructured[all-docs]>=0.10.4
|
||||||
python-magic-bin; sys_platform == 'win32'
|
python-magic-bin; sys_platform == 'win32'
|
||||||
SQLAlchemy==2.0.19
|
SQLAlchemy==2.0.19
|
||||||
faiss-cpu
|
faiss-cpu
|
||||||
nltk
|
|
||||||
accelerate
|
accelerate
|
||||||
spacy
|
spacy
|
||||||
PyMuPDF==1.22.5
|
PyMuPDF==1.22.5
|
||||||
rapidocr_onnxruntime>=1.3.1
|
rapidocr_onnxruntime>=1.3.2
|
||||||
websockets
|
|
||||||
|
requests
|
||||||
|
pathlib
|
||||||
|
pytest
|
||||||
|
scikit-learn
|
||||||
|
numexpr
|
||||||
|
|
||||||
# uncomment libs if you want to use corresponding vector store
|
# uncomment libs if you want to use corresponding vector store
|
||||||
# pymilvus==2.1.3 # requires milvus==2.1.3
|
# pymilvus==2.1.3 # requires milvus==2.1.3
|
||||||
|
|
|
||||||
|
|
@@ -20,22 +20,34 @@ def text(splitter_name):
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE)
|
text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE)
|
||||||
if splitter_name == "MarkdownHeaderTextSplitter":
|
if splitter_name == "MarkdownHeaderTextSplitter":
|
||||||
split_docs = text_splitter.split_text(docs[0].page_content)
|
docs = text_splitter.split_text(docs[0].page_content)
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
if doc.metadata:
|
if doc.metadata:
|
||||||
doc.metadata["source"] = os.path.basename(filepath)
|
doc.metadata["source"] = os.path.basename(filepath)
|
||||||
else:
|
else:
|
||||||
split_docs = text_splitter.split_documents(docs)
|
docs = text_splitter.split_documents(docs)
|
||||||
|
for doc in docs:
|
||||||
|
print(doc)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@pytest.mark.parametrize("splitter_name", ["ChineseRecursiveTextSplitter", "SpacyTextSplitter", "RecursiveCharacterTextSplitter","MarkdownHeaderTextSplitter"])
|
from langchain.docstore.document import Document
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("splitter_name",
|
||||||
|
[
|
||||||
|
"ChineseRecursiveTextSplitter",
|
||||||
|
"SpacyTextSplitter",
|
||||||
|
"RecursiveCharacterTextSplitter",
|
||||||
|
"MarkdownHeaderTextSplitter"
|
||||||
|
])
|
||||||
def test_different_splitter(splitter_name):
|
def test_different_splitter(splitter_name):
|
||||||
try:
|
try:
|
||||||
docs = text(splitter_name)
|
docs = text(splitter_name)
|
||||||
assert docs is not None
|
assert isinstance(docs, list)
|
||||||
|
if len(docs)>0:
|
||||||
|
assert isinstance(docs[0], Document)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}")
|
pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}")
|
||||||
|
|
|
||||||
|
|
@@ -83,7 +83,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
if _good_splits:
|
if _good_splits:
|
||||||
merged_text = self._merge_splits(_good_splits, _separator)
|
merged_text = self._merge_splits(_good_splits, _separator)
|
||||||
final_chunks.extend(merged_text)
|
final_chunks.extend(merged_text)
|
||||||
return final_chunks
|
return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue