From fbe214471b573abbcc5c30e2bd09233b5c4ef657 Mon Sep 17 00:00:00 2001 From: liunux4odoo <41217877+liunux4odoo@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:37:09 +0800 Subject: [PATCH] =?UTF-8?q?=E7=9F=A5=E8=AF=86=E5=BA=93=E6=94=AF=E6=8C=81?= =?UTF-8?q?=20.jsonl,=20.epub,=20.xlsx,=20.xlsd,=20.ipynb,=20.odt,=20.py,?= =?UTF-8?q?=20.srt,=20.toml,=20.doc,=20.ppt=20=E6=96=87=E4=BB=B6=20(#2079)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 知识库支持行式 json 文件 如果要使用 json 文件, 需要 `conda install jq`(windows 下 pip install jq 会失败) 开发者: 删除 CustomJsonLoader,使用 langchain 自带的 JsonLoader 处理 json 文件,添加 JsonLinesLoader 处理 jsonl 文件。 * 知识库支持 .epub, .xlsx, .xlsd, .ipynb, .odt, .py, .srt, .toml, .doc, .ppt 文件 为 .eml, .msg, .rst, .rtf, .tsv, .docx, .xml, .pptx 指定专用加载器 --- server/knowledge_base/utils.py | 95 +++++++++++------------------ tests/kb_vector_db/test_faiss_kb.py | 6 +- 2 files changed, 39 insertions(+), 62 deletions(-) diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index d6720ec..2f91652 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -16,10 +16,9 @@ import langchain.document_loaders from langchain.docstore.document import Document from langchain.text_splitter import TextSplitter from pathlib import Path -import json from server.utils import run_in_thread_pool, get_model_worker_config import io -from typing import List, Union, Callable, Dict, Optional, Tuple, Generator +from typing import List, Union,Dict, Tuple, Generator import chardet @@ -71,73 +70,41 @@ def list_files_from_folder(kb_name: str): LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'], "UnstructuredMarkdownLoader": ['.md'], - "CustomJSONLoader": [".json"], + "JSONLoader": [".json"], + "JSONLinesLoader": [".jsonl"], "CSVLoader": [".csv"], # "FilteredCSVLoader": [".csv"], # 需要自己指定,目前还没有支持 "RapidOCRPDFLoader": [".pdf"], "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'], - "UnstructuredFileLoader": 
['.eml', '.msg', '.rst', - '.rtf', '.txt', '.xml', - '.docx', '.epub', '.odt', - '.ppt', '.pptx', '.tsv'], + "UnstructuredEmailLoader": ['.eml', '.msg'], + "UnstructuredEPubLoader": ['.epub'], + "UnstructuredExcelLoader": ['.xlsx', '.xlsd'], + "NotebookLoader": ['.ipynb'], + "UnstructuredODTLoader": ['.odt'], + "PythonLoader": ['.py'], + "UnstructuredRSTLoader": ['.rst'], + "UnstructuredRTFLoader": ['.rtf'], + "SRTLoader": ['.srt'], + "TomlLoader": ['.toml'], + "UnstructuredTSVLoader": ['.tsv'], + "UnstructuredWordDocumentLoader": ['.docx', '.doc'], + "UnstructuredXMLLoader": ['.xml'], + "UnstructuredPowerPointLoader": ['.ppt', '.pptx'], + "UnstructuredFileLoader": ['.txt'], } SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist] -class CustomJSONLoader(langchain.document_loaders.JSONLoader): +class JSONLinesLoader(langchain.document_loaders.JSONLoader): ''' - langchain的JSONLoader需要jq,在win上使用不便,进行替代。针对langchain==0.0.286 + 行式 Json 加载器,要求文件扩展名为 .jsonl ''' - - def __init__( - self, - file_path: Union[str, Path], - content_key: Optional[str] = None, - metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None, - text_content: bool = True, - json_lines: bool = False, - ): - """Initialize the JSONLoader. - - Args: - file_path (Union[str, Path]): The path to the JSON or JSON Lines file. - content_key (str): The key to use to extract the content from the JSON if - results to a list of objects (dict). - metadata_func (Callable[Dict, Dict]): A function that takes in the JSON - object extracted by the jq_schema and the default metadata and returns - a dict of the updated metadata. - text_content (bool): Boolean flag to indicate whether the content is in - string format, default to True. - json_lines (bool): Boolean flag to indicate whether the input is in - JSON Lines format. 
- """ - self.file_path = Path(file_path).resolve() - self._content_key = content_key - self._metadata_func = metadata_func - self._text_content = text_content - self._json_lines = json_lines - - def _parse(self, content: str, docs: List[Document]) -> None: - """Convert given content to documents.""" - data = json.loads(content) - - # Perform some validation - # This is not a perfect validation, but it should catch most cases - # and prevent the user from getting a cryptic error later on. - if self._content_key is not None: - self._validate_content_key(data) - if self._metadata_func is not None: - self._validate_metadata_func(data) - - for i, sample in enumerate(data, len(docs) + 1): - text = self._get_text(sample=sample) - metadata = self._get_metadata( - sample=sample, source=str(self.file_path), seq_num=i - ) - docs.append(Document(page_content=text, metadata=metadata)) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._json_lines = True -langchain.document_loaders.CustomJSONLoader = CustomJSONLoader +langchain.document_loaders.JSONLinesLoader = JSONLinesLoader def get_LoaderClass(file_extension): @@ -178,12 +145,22 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri elif loader_name == "JSONLoader": loader = DocumentLoader(file_path_or_content, jq_schema=".", text_content=False) - elif loader_name == "CustomJSONLoader": - loader = DocumentLoader(file_path_or_content, text_content=False) + elif loader_name == "JSONLinesLoader": + loader = DocumentLoader(file_path_or_content, jq_schema=".", text_content=False, json_lines=True) elif loader_name == "UnstructuredMarkdownLoader": loader = DocumentLoader(file_path_or_content, mode="elements") elif loader_name == "UnstructuredHTMLLoader": loader = DocumentLoader(file_path_or_content, mode="elements") + elif loader_name == "UnstructuredXMLLoader": + loader = DocumentLoader(file_path_or_content, mode="elements") + elif loader_name == "UnstructuredRSTLoader": 
+ loader = DocumentLoader(file_path_or_content, mode="elements") + elif loader_name == "UnstructuredExcelLoader": + loader = DocumentLoader(file_path_or_content, mode="elements") + elif loader_name == "UnstructuredWordDocumentLoader": + loader = DocumentLoader(file_path_or_content, mode="elements") + elif loader_name == "UnstructuredPowerPointLoader": + loader = DocumentLoader(file_path_or_content, mode="elements") else: loader = DocumentLoader(file_path_or_content) return loader diff --git a/tests/kb_vector_db/test_faiss_kb.py b/tests/kb_vector_db/test_faiss_kb.py index 9c329c8..0264c89 100644 --- a/tests/kb_vector_db/test_faiss_kb.py +++ b/tests/kb_vector_db/test_faiss_kb.py @@ -2,6 +2,7 @@ from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService from server.knowledge_base.migrate import create_tables from server.knowledge_base.utils import KnowledgeFile + kbService = FaissKBService("test") test_kb_name = "test" test_file_name = "README.md" @@ -24,12 +25,11 @@ def test_add_doc(): def test_search_db(): result = kbService.search_docs(search_content) assert len(result) > 0 + + def test_delete_doc(): assert kbService.delete_doc(testKnowledgeFile) - - - def test_delete_db(): assert kbService.drop_kb()