Update requirements.txt, requirements_api.txt, test_different_splitter.py, and chinese_recursive_text_splitter.py
This commit is contained in:
parent
1813814a65
commit
fbaca1009e
|
|
@ -18,6 +18,12 @@ spacy
|
|||
PyMuPDF==1.22.5
|
||||
rapidocr_onnxruntime>=1.3.2
|
||||
|
||||
requests
|
||||
pathlib
|
||||
pytest
|
||||
scikit-learn
|
||||
numexpr
|
||||
|
||||
# uncomment libs if you want to use corresponding vector store
|
||||
# pymilvus==2.1.3 # requires milvus==2.1.3
|
||||
# psycopg2
|
||||
|
|
|
|||
|
|
@ -9,18 +9,22 @@ nltk~=3.8.1
|
|||
uvicorn~=0.23.1
|
||||
starlette~=0.27.0
|
||||
pydantic~=1.10.11
|
||||
unstructured[all-docs]
|
||||
unstructured[all-docs]>=0.10.4
|
||||
python-magic-bin; sys_platform == 'win32'
|
||||
SQLAlchemy==2.0.19
|
||||
faiss-cpu
|
||||
nltk
|
||||
accelerate
|
||||
spacy
|
||||
PyMuPDF==1.22.5
|
||||
rapidocr_onnxruntime>=1.3.1
|
||||
websockets
|
||||
rapidocr_onnxruntime>=1.3.2
|
||||
|
||||
requests
|
||||
pathlib
|
||||
pytest
|
||||
scikit-learn
|
||||
numexpr
|
||||
|
||||
# uncomment libs if you want to use corresponding vector store
|
||||
# pymilvus==2.1.3 # requires milvus==2.1.3
|
||||
# psycopg2
|
||||
# pgvector
|
||||
# pgvector
|
||||
|
|
@ -20,22 +20,34 @@ def text(splitter_name):
|
|||
docs = loader.load()
|
||||
text_splitter = make_text_splitter(splitter_name, CHUNK_SIZE, OVERLAP_SIZE)
|
||||
if splitter_name == "MarkdownHeaderTextSplitter":
|
||||
split_docs = text_splitter.split_text(docs[0].page_content)
|
||||
docs = text_splitter.split_text(docs[0].page_content)
|
||||
for doc in docs:
|
||||
if doc.metadata:
|
||||
doc.metadata["source"] = os.path.basename(filepath)
|
||||
else:
|
||||
split_docs = text_splitter.split_documents(docs)
|
||||
docs = text_splitter.split_documents(docs)
|
||||
for doc in docs:
|
||||
print(doc)
|
||||
return docs
|
||||
|
||||
|
||||
|
||||
|
||||
import pytest
|
||||
@pytest.mark.parametrize("splitter_name", ["ChineseRecursiveTextSplitter", "SpacyTextSplitter", "RecursiveCharacterTextSplitter","MarkdownHeaderTextSplitter"])
|
||||
from langchain.docstore.document import Document
|
||||
|
||||
@pytest.mark.parametrize("splitter_name",
|
||||
[
|
||||
"ChineseRecursiveTextSplitter",
|
||||
"SpacyTextSplitter",
|
||||
"RecursiveCharacterTextSplitter",
|
||||
"MarkdownHeaderTextSplitter"
|
||||
])
|
||||
def test_different_splitter(splitter_name):
|
||||
try:
|
||||
docs = text(splitter_name)
|
||||
assert docs is not None
|
||||
assert isinstance(docs, list)
|
||||
if len(docs)>0:
|
||||
assert isinstance(docs[0], Document)
|
||||
except Exception as e:
|
||||
pytest.fail(f"test_different_splitter failed with {splitter_name}, error: {str(e)}")
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
if _good_splits:
|
||||
merged_text = self._merge_splits(_good_splits, _separator)
|
||||
final_chunks.extend(merged_text)
|
||||
return final_chunks
|
||||
return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Reference in New Issue