From 1d12f84310981929b08cac82f354048d5e6aff88 Mon Sep 17 00:00:00 2001
From: wvivi2023
Date: Fri, 26 Jan 2024 14:18:57 +0800
Subject: [PATCH] fix the issue that .doc files can't be loaded

---
 document_loaders/__init__.py                  |  2 +-
 document_loaders/mywordload.py                | 14 +++--
 .../tools/search_knowledgebase_complex.py     |  2 +-
 server/knowledge_base/utils.py                |  2 +-
 server/utils.py                               | 46 ++++----------
 5 files changed, 22 insertions(+), 44 deletions(-)

diff --git a/document_loaders/__init__.py b/document_loaders/__init__.py
index 8ad1da6..ff1f046 100644
--- a/document_loaders/__init__.py
+++ b/document_loaders/__init__.py
@@ -2,4 +2,4 @@ from .mypdfloader import RapidOCRPDFLoader
 from .myimgloader import RapidOCRLoader
 from .customiedpdfloader import CustomizedPDFLoader
 from .mywordload import RapidWordLoader
-#from .customercore import custom_group_broken_paragraphs
+from .customercore import custom_group_broken_paragraphs
diff --git a/document_loaders/mywordload.py b/document_loaders/mywordload.py
index 9c587d8..eecb653 100644
--- a/document_loaders/mywordload.py
+++ b/document_loaders/mywordload.py
@@ -5,11 +5,15 @@ from docx.document import Document as _Document
 from docx.table import _Cell
 from docx.oxml.text.paragraph import CT_P
 from docx.oxml.table import CT_Tbl
+from docx.oxml.table import CT_TblGrid
 from docx.table import _Cell, Table
 from docx.text.paragraph import Paragraph
 from unstructured.partition.text import partition_text
 import unstructured.cleaners.core
-from .customercore import custom_group_broken_paragraphs
+try:
+    from .customercore import custom_group_broken_paragraphs
+except ImportError:  # allow running this file directly as a script
+    from customercore import custom_group_broken_paragraphs
 unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
 
 class RapidWordLoader(UnstructuredFileLoader):
@@ -32,6 +36,10 @@ class RapidWordLoader(UnstructuredFileLoader):
                         yield Paragraph(child, parent)
                     elif isinstance(child, CT_Tbl):
                         yield Table(child, parent)
+                    elif isinstance(child, CT_TblGrid):
+                        yield Table(child, parent)
+                    else:
+                        print("unrecognized block item type")
 
             def read_table(table):
                 # Get the table's column headers
@@ -61,7 +69,7 @@ class RapidWordLoader(UnstructuredFileLoader):
             doc = docxDocument(filepath)
             for block in iter_block_items(doc):
                 if isinstance(block,Paragraph):
-                    #print(f"Paragraph:{block.text}")
+                    print(f"Paragraph:{block.text}")
                     resp += (block.text + "\n\n")
                 elif isinstance(block, Table):
                     resp += read_table(block) + "\n"
@@ -76,7 +84,7 @@ class RapidWordLoader(UnstructuredFileLoader):
         return listText
 
 if __name__ == "__main__":
-    loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")
+    loader = RapidWordLoader(file_path="/Users/wangvivi/Downloads/输变电设备风险评估导则.docx")
     #loader = Docx2txtLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")
     #loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/MySelf/AI/Test/这是一个测试文档_副本2.docx")
     docs = loader.load()
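Note on the mywordload.py hunks: the loader relies on the well-known python-docx recipe for walking a document body in order, yielding paragraphs and tables as they appear. A minimal, self-contained sketch of that traversal, assuming only public python-docx behavior (the sample path is a placeholder, not a file from this repo):

    from docx import Document
    from docx.document import Document as _Document
    from docx.oxml.text.paragraph import CT_P
    from docx.oxml.table import CT_Tbl
    from docx.table import _Cell, Table
    from docx.text.paragraph import Paragraph

    def iter_block_items(parent):
        # Yield Paragraph and Table objects in document order.
        if isinstance(parent, _Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("unsupported parent type")
        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    doc = Document("sample.docx")  # placeholder path
    for block in iter_block_items(doc):
        print(type(block).__name__)  # "Paragraph" or "Table"

Iterating the raw body XML is what lets tables be re-serialized inline between the paragraphs that surround them, which plain text extractors lose.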
diff --git a/server/agent/tools/search_knowledgebase_complex.py b/server/agent/tools/search_knowledgebase_complex.py
index af4d911..4664c70 100644
--- a/server/agent/tools/search_knowledgebase_complex.py
+++ b/server/agent/tools/search_knowledgebase_complex.py
@@ -11,7 +11,7 @@ from langchain.schema.language_model import BaseLanguageModel
 from typing import List, Any, Optional
 from langchain.prompts import PromptTemplate
 from server.chat.knowledge_base_chat import knowledge_base_chat
-from configs import VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, MAX_TOKENS
+from configs import FIRST_VECTOR_SEARCH_TOP_K, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, MAX_TOKENS
 import asyncio
 from server.agent import model_container
 from pydantic import BaseModel, Field
diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py
index 75dce1d..84f0a29 100644
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@@ -123,7 +123,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
                "UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
                "EverNoteLoader": ['.enex'],
                "UnstructuredFileLoader": ['.txt'],
-               "Docx2txtLoader":['.doc'],
+               "UnstructuredWordDocumentLoader":['.doc'],
                "RapidWordLoader":['.docx']
                }
 SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
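Note on the LOADER_DICT hunk: docx2txt only understands the zip-based .docx container, so routing legacy binary .doc files through Docx2txtLoader is the likely source of the load failure in the subject line; unstructured's UnstructuredWordDocumentLoader can handle .doc, while .docx keeps the custom RapidWordLoader. A minimal sketch of the extension-to-loader lookup this table drives; the helper name loader_name_for is illustrative, not a function in the repo:

    # Trimmed copy of the mapping, keeping only the entries touched above.
    LOADER_DICT = {
        "UnstructuredWordDocumentLoader": ['.doc'],
        "RapidWordLoader": ['.docx'],
    }

    def loader_name_for(ext: str) -> str:
        # Reverse lookup: which loader class name is registered for an extension?
        for loader_name, extensions in LOADER_DICT.items():
            if ext.lower() in extensions:
                return loader_name
        raise ValueError(f"unsupported extension: {ext}")

    assert loader_name_for(".doc") == "UnstructuredWordDocumentLoader"
    assert loader_name_for(".docx") == "RapidWordLoader"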
diff --git a/server/utils.py b/server/utils.py
index 5512ee6..e0aff28 100644
--- a/server/utils.py
+++ b/server/utils.py
@@ -9,12 +9,11 @@ from configs import (LLM_MODELS, LLM_DEVICE, EMBEDDING_DEVICE,
                      FSCHAT_MODEL_WORKERS, HTTPX_DEFAULT_TIMEOUT)
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from langchain.chat_models import ChatOpenAI
-from langchain.llms import OpenAI
+from langchain.llms import OpenAI, AzureOpenAI, Anthropic
 import httpx
 from typing import Literal, Optional, Callable, Generator, Dict, Any, Awaitable, Union, Tuple
 import logging
-import torch
 
 
 async def wrap_done(fn: Awaitable, event: asyncio.Event):
@@ -59,7 +58,6 @@ def get_ChatOpenAI(
     )
     return model
 
-
 def get_OpenAI(
     model_name: str,
     temperature: float,
@@ -153,6 +151,7 @@ class ChatMessage(BaseModel):
 
 def torch_gc():
     try:
+        import torch
         if torch.cuda.is_available():
             # with torch.cuda.device(DEVICE):
             torch.cuda.empty_cache()
@@ -500,58 +499,29 @@ def set_httpx_config(
 
 
 # Automatically detect the device available to torch; in distributed deployments, machines that do not run the LLM do not need torch installed
-def is_mps_available():
-    return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-
-
-def is_cuda_available():
-    return torch.cuda.is_available()
-
-
 def detect_device() -> Literal["cuda", "mps", "cpu"]:
     try:
+        import torch
         if torch.cuda.is_available():
             return "cuda"
-        if is_mps_available():
+        if torch.backends.mps.is_available():
             return "mps"
     except:
         pass
     return "cpu"
 
 
-def llm_device(device: str = None) -> Literal["cuda", "mps", "cpu", "xpu"]:
+def llm_device(device: str = None) -> Literal["cuda", "mps", "cpu"]:
     device = device or LLM_DEVICE
-    if device not in ["cuda", "mps", "cpu", "xpu"]:
-        logging.warning(f"device not in ['cuda', 'mps', 'cpu','xpu'], device = {device}")
+    if device not in ["cuda", "mps", "cpu"]:
         device = detect_device()
-    elif device == 'cuda' and not is_cuda_available() and is_mps_available():
-        logging.warning("cuda is not available, fallback to mps")
-        return "mps"
-    if device == 'mps' and not is_mps_available() and is_cuda_available():
-        logging.warning("mps is not available, fallback to cuda")
-        return "cuda"
-
-    # auto detect device if not specified
-    if device not in ["cuda", "mps", "cpu", "xpu"]:
-        return detect_device()
     return device
 
 
-def embedding_device(device: str = None) -> Literal["cuda", "mps", "cpu", "xpu"]:
-    device = device or LLM_DEVICE
+def embedding_device(device: str = None) -> Literal["cuda", "mps", "cpu"]:
+    device = device or EMBEDDING_DEVICE
     if device not in ["cuda", "mps", "cpu"]:
-        logging.warning(f"device not in ['cuda', 'mps', 'cpu','xpu'], device = {device}")
         device = detect_device()
-    elif device == 'cuda' and not is_cuda_available() and is_mps_available():
-        logging.warning("cuda is not available, fallback to mps")
-        return "mps"
-    if device == 'mps' and not is_mps_available() and is_cuda_available():
-        logging.warning("mps is not available, fallback to cuda")
-        return "cuda"
-
-    # auto detect device if not specified
-    if device not in ["cuda", "mps", "cpu"]:
-        return detect_device()
     return device
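Note on the server/utils.py hunks: torch becomes a lazily imported, optional dependency, so hosts that never run the LLM need not install it, and detect_device() degrades to "cpu". llm_device() and embedding_device() now normalize any unknown config value (such as the dropped "xpu") through auto-detection instead of the old pairwise cuda/mps fallbacks. A standalone sketch of the resulting behavior; the default parameter stands in for the LLM_DEVICE config value:

    from typing import Literal, Optional

    def detect_device() -> Literal["cuda", "mps", "cpu"]:
        try:
            import torch  # lazy import: optional on machines that don't run the LLM
            if torch.cuda.is_available():
                return "cuda"
            if torch.backends.mps.is_available():
                return "mps"
        except Exception:  # torch missing or backend probing failed
            pass
        return "cpu"

    def llm_device(device: Optional[str] = None,
                   default: str = "auto") -> Literal["cuda", "mps", "cpu"]:
        # "default" stands in for the LLM_DEVICE config value.
        device = device or default
        if device not in ("cuda", "mps", "cpu"):
            device = detect_device()  # unknown values (e.g. "xpu") fall back here
        return device

    print(llm_device("xpu"))  # no longer an error: resolves via detect_device()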