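Update local_doc_qa.py: load .txt files with TextLoader (encoding auto-detection) and split them with ChineseTextSplitter; merge the two `loader` imports into one line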
This commit is contained in:
parent 6d1523728b
commit 2681728329
|
|
@ -1,6 +1,6 @@
|
||||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
||||||
from langchain.vectorstores import FAISS
|
from langchain.vectorstores import FAISS
|
||||||
from langchain.document_loaders import UnstructuredFileLoader
|
from langchain.document_loaders import UnstructuredFileLoader, TextLoader
|
||||||
from configs.model_config import *
|
from configs.model_config import *
|
||||||
import datetime
|
import datetime
|
||||||
from textsplitter import ChineseTextSplitter
|
from textsplitter import ChineseTextSplitter
|
||||||
|
|
@ -10,8 +10,7 @@ import numpy as np
|
||||||
from utils import torch_gc
|
from utils import torch_gc
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from pypinyin import lazy_pinyin
|
from pypinyin import lazy_pinyin
|
||||||
from loader import UnstructuredPaddleImageLoader
|
from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader
|
||||||
from loader import UnstructuredPaddlePDFLoader
|
|
||||||
from models.base import (BaseAnswer,
|
from models.base import (BaseAnswer,
|
||||||
AnswerResult,
|
AnswerResult,
|
||||||
AnswerResultStream,
|
AnswerResultStream,
|
||||||
|
|
@ -21,14 +20,14 @@ from models.loader import LoaderCheckPoint
|
||||||
import models.shared as shared
|
import models.shared as shared
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_file(filepath, sentence_size=SENTENCE_SIZE):
|
def load_file(filepath, sentence_size=SENTENCE_SIZE):
|
||||||
if filepath.lower().endswith(".md"):
|
if filepath.lower().endswith(".md"):
|
||||||
loader = UnstructuredFileLoader(filepath, mode="elements")
|
loader = UnstructuredFileLoader(filepath, mode="elements")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
elif filepath.lower().endswith(".txt"):
|
elif filepath.lower().endswith(".txt"):
|
||||||
loader = UnstructuredFileLoader(filepath, mode="elements")
|
loader = TextLoader(filepath, autodetect_encoding=True)
|
||||||
docs = loader.load()
|
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
|
||||||
|
docs = loader.load_and_split(textsplitter)
|
||||||
elif filepath.lower().endswith(".pdf"):
|
elif filepath.lower().endswith(".pdf"):
|
||||||
loader = UnstructuredPaddlePDFLoader(filepath)
|
loader = UnstructuredPaddlePDFLoader(filepath)
|
||||||
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
|
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue