知识库支持 .jsonl, .epub, .xlsx, .xlsd, .ipynb, .odt, .py, .srt, .toml, .doc, .ppt 文件 (#2079)

* 知识库支持行式 json 文件
  如果要使用 json 文件，需要 `conda install jq`（Windows 下 `pip install jq` 会失败）。
  开发者：删除 CustomJSONLoader，使用 langchain 自带的 JSONLoader 处理 json 文件，添加 JSONLinesLoader 处理 jsonl 文件。
* 知识库支持 .epub, .xlsx, .xlsd, .ipynb, .odt, .py, .srt, .toml, .doc, .ppt 文件
  为 .eml, .msg, .rst, .rtf, .tsv, .docx, .xml, .pptx 指定专用加载器。
2023-11-16 09:37:09 +08:00 · 2023-11-16 09:37:09 +08:00 · fbe214471b
parent 3b3d948d27
commit fbe214471b
2 changed files with 39 additions and 62 deletions
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@ -16,10 +16,9 @@ import langchain.document_loaders
 from langchain.docstore.document import Document
 from langchain.text_splitter import TextSplitter
 from pathlib import Path
-import json
 from server.utils import run_in_thread_pool, get_model_worker_config
 import io
-from typing import List, Union, Callable, Dict, Optional, Tuple, Generator
+from typing import List, Union,Dict, Tuple, Generator
 import chardet


@ -71,73 +70,41 @@ def list_files_from_folder(kb_name: str):

 LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
                "UnstructuredMarkdownLoader": ['.md'],
-               "CustomJSONLoader": [".json"],
+               "JSONLoader": [".json"],
+               "JSONLinesLoader": [".jsonl"],
                "CSVLoader": [".csv"],
                # "FilteredCSVLoader": [".csv"], # must be selected explicitly; not supported yet
                "RapidOCRPDFLoader": [".pdf"],
                "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
-               "UnstructuredFileLoader": ['.eml', '.msg', '.rst',
-                                          '.rtf', '.txt', '.xml',
-                                          '.docx', '.epub', '.odt',
-                                          '.ppt', '.pptx', '.tsv'],
+               "UnstructuredEmailLoader": ['.eml', '.msg'],
+               "UnstructuredEPubLoader": ['.epub'],
+               "UnstructuredExcelLoader": ['.xlsx', '.xlsd'],
+               "NotebookLoader": ['.ipynb'],
+               "UnstructuredODTLoader": ['.odt'],
+               "PythonLoader": ['.py'],
+               "UnstructuredRSTLoader": ['.rst'],
+               "UnstructuredRTFLoader": ['.rtf'],
+               "SRTLoader": ['.srt'],
+               "TomlLoader": ['.toml'],
+               "UnstructuredTSVLoader": ['.tsv'],
+               "UnstructuredWordDocumentLoader": ['.docx', '.doc'],
+               "UnstructuredXMLLoader": ['.xml'],
+               "UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
+               "UnstructuredFileLoader": ['.txt'],
                }
 SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]


-class CustomJSONLoader(langchain.document_loaders.JSONLoader):
+class JSONLinesLoader(langchain.document_loaders.JSONLoader):
    '''
-    langchain的JSONLoader需要jq，在win上使用不便，进行替代。针对langchain==0.0.286
+    行式 Json 加载器，要求文件扩展名为 .jsonl
    '''
-
-    def __init__(
-            self,
-            file_path: Union[str, Path],
-            content_key: Optional[str] = None,
-            metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
-            text_content: bool = True,
-            json_lines: bool = False,
-    ):
-        """Initialize the JSONLoader.
-
-        Args:
-            file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
-            content_key (str): The key to use to extract the content from the JSON if
-                results to a list of objects (dict).
-            metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
-                object extracted by the jq_schema and the default metadata and returns
-                a dict of the updated metadata.
-            text_content (bool): Boolean flag to indicate whether the content is in
-                string format, default to True.
-            json_lines (bool): Boolean flag to indicate whether the input is in
-                JSON Lines format.
-        """
-        self.file_path = Path(file_path).resolve()
-        self._content_key = content_key
-        self._metadata_func = metadata_func
-        self._text_content = text_content
-        self._json_lines = json_lines
-
-    def _parse(self, content: str, docs: List[Document]) -> None:
-        """Convert given content to documents."""
-        data = json.loads(content)
-
-        # Perform some validation
-        # This is not a perfect validation, but it should catch most cases
-        # and prevent the user from getting a cryptic error later on.
-        if self._content_key is not None:
-            self._validate_content_key(data)
-        if self._metadata_func is not None:
-            self._validate_metadata_func(data)
-
-        for i, sample in enumerate(data, len(docs) + 1):
-            text = self._get_text(sample=sample)
-            metadata = self._get_metadata(
-                sample=sample, source=str(self.file_path), seq_num=i
-            )
-            docs.append(Document(page_content=text, metadata=metadata))
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._json_lines = True


-langchain.document_loaders.CustomJSONLoader = CustomJSONLoader
+langchain.document_loaders.JSONLinesLoader = JSONLinesLoader


 def get_LoaderClass(file_extension):
@ -178,12 +145,22 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri

    elif loader_name == "JSONLoader":
        loader = DocumentLoader(file_path_or_content, jq_schema=".", text_content=False)
-    elif loader_name == "CustomJSONLoader":
-        loader = DocumentLoader(file_path_or_content, text_content=False)
+    elif loader_name == "JSONLinesLoader":
+        loader = DocumentLoader(file_path_or_content, jq_schema=".", text_content=False, json_lines=True)
    elif loader_name == "UnstructuredMarkdownLoader":
        loader = DocumentLoader(file_path_or_content, mode="elements")
    elif loader_name == "UnstructuredHTMLLoader":
        loader = DocumentLoader(file_path_or_content, mode="elements")
+    elif loader_name == "UnstructuredXMLLoader":
+        loader = DocumentLoader(file_path_or_content, mode="elements")
+    elif loader_name == "UnstructuredRSTLoader":
+        loader = DocumentLoader(file_path_or_content, mode="elements")
+    elif loader_name == "UnstructuredExcelLoader":
+        loader = DocumentLoader(file_path_or_content, mode="elements")
+    elif loader_name == "UnstructuredWordDocumentLoader":
+        loader = DocumentLoader(file_path_or_content, mode="elements")
+    elif loader_name == "UnstructuredPowerPointLoader":
+        loader = DocumentLoader(file_path_or_content, mode="elements")
    else:
        loader = DocumentLoader(file_path_or_content)
    return loader
--- a/tests/kb_vector_db/test_faiss_kb.py
+++ b/tests/kb_vector_db/test_faiss_kb.py
@ -2,6 +2,7 @@ from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
 from server.knowledge_base.migrate import create_tables
 from server.knowledge_base.utils import KnowledgeFile

+
 kbService = FaissKBService("test")
 test_kb_name = "test"
 test_file_name = "README.md"
@ -24,12 +25,11 @@ def test_add_doc():
 def test_search_db():
    result = kbService.search_docs(search_content)
    assert len(result) > 0
+
+
 def test_delete_doc():
    assert kbService.delete_doc(testKnowledgeFile)


-
-
-
 def test_delete_db():
    assert kbService.drop_kb()