From 313e59096174b6de0c046385808b134b6a743e4a Mon Sep 17 00:00:00 2001 From: imClumsyPanda Date: Sat, 5 Aug 2023 21:51:07 +0800 Subject: [PATCH] update DocumentLoader in knowledge_file.py --- server/knowledge_base/knowledge_file.py | 26 +++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/server/knowledge_base/knowledge_file.py b/server/knowledge_base/knowledge_file.py index b00e616..0ce66a4 100644 --- a/server/knowledge_base/knowledge_file.py +++ b/server/knowledge_base/knowledge_file.py @@ -1,6 +1,20 @@ import os.path from server.knowledge_base.utils import (get_file_path) from server.knowledge_base import KnowledgeBase +import sys + +LOADER_DICT = {"UnstructuredFileLoader": ['.eml', '.html', '.json', '.md', '.msg', '.rst', + '.rtf', '.txt', '.xml', + '.doc', '.docx', '.epub', '.odt', '.pdf', + '.ppt', '.pptx', '.tsv'], # '.pdf', '.xlsx', '.csv' + "CSVLoader": [".csv"], + } +SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist] + +def get_LoaderClass(file_extension): + for LoaderClass, extensions in LOADER_DICT.items(): + if file_extension in extensions: + return LoaderClass class KnowledgeFile: @@ -12,16 +26,16 @@ class KnowledgeFile: self.kb = KnowledgeBase.load(knowledge_base_name) self.filename = filename self.ext = os.path.splitext(filename)[-1] + if self.ext not in SUPPORTED_EXTS: + raise ValueError(f"暂未支持的文件格式 {self.ext}") self.filepath = get_file_path(knowledge_base_name, filename) self.docs = None + self.loader_class_name = get_LoaderClass(self.ext) def file2text(self): - if self.ext in []: - from langchain.document_loaders import UnstructuredFileLoader - loader = UnstructuredFileLoader(self.filepath) - elif self.ext in []: - pass + LoaderClass = getattr(sys.modules['langchain.document_loaders'], self.loader_class_name) + loader = LoaderClass(self.filepath) from langchain.text_splitter import CharacterTextSplitter text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=200) - return loader.load_and_split(text_splitter) \ No newline at end of file + return loader.load_and_split(text_splitter)