From 1d12f84310981929b08cac82f354048d5e6aff88 Mon Sep 17 00:00:00 2001
From: wvivi2023
Date: Fri, 26 Jan 2024 14:18:57 +0800
Subject: [PATCH] fix the issue that .doc files can't be loaded

---
 document_loaders/__init__.py                  |  2 +-
 document_loaders/mywordload.py                | 14 +++--
 .../tools/search_knowledgebase_complex.py     |  2 +-
 server/knowledge_base/utils.py                |  2 +-
 server/utils.py                               | 46 ++++----------
 5 files changed, 22 insertions(+), 44 deletions(-)

diff --git a/document_loaders/__init__.py b/document_loaders/__init__.py
index 8ad1da6..ff1f046 100644
--- a/document_loaders/__init__.py
+++ b/document_loaders/__init__.py
@@ -2,4 +2,4 @@ from .mypdfloader import RapidOCRPDFLoader
 from .myimgloader import RapidOCRLoader
 from .customiedpdfloader import CustomizedPDFLoader
 from .mywordload import RapidWordLoader
-#from .customercore import custom_group_broken_paragraphs
+from .customercore import custom_group_broken_paragraphs
diff --git a/document_loaders/mywordload.py b/document_loaders/mywordload.py
index 9c587d8..eecb653 100644
--- a/document_loaders/mywordload.py
+++ b/document_loaders/mywordload.py
@@ -5,11 +5,15 @@ from docx.document import Document as _Document
 from docx.table import _Cell
 from docx.oxml.text.paragraph import CT_P
 from docx.oxml.table import CT_Tbl
+from docx.oxml.table import CT_TblGrid
 from docx.table import _Cell, Table
 from docx.text.paragraph import Paragraph
 from unstructured.partition.text import partition_text
 import unstructured.cleaners.core
-from .customercore import custom_group_broken_paragraphs
+try:
+    from .customercore import custom_group_broken_paragraphs
+except ImportError:  # allow running this file directly as a script
+    from customercore import custom_group_broken_paragraphs
 unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
 
 class RapidWordLoader(UnstructuredFileLoader):
@@ -32,6 +36,10 @@ class RapidWordLoader(UnstructuredFileLoader):
                         yield Paragraph(child, parent)
                     elif isinstance(child, CT_Tbl):
                         yield Table(child, parent)
+                    elif isinstance(child, CT_TblGrid):
+                        yield Table(child, parent)
+                    else:
+                        print("unrecognized block item type")
 
             def read_table(table):
                 # Get the table's column headers
@@ -61,7 +69,7 @@ class RapidWordLoader(UnstructuredFileLoader):
             doc = docxDocument(filepath)
             for block in iter_block_items(doc):
                 if isinstance(block,Paragraph):
-                    #print(f"Paragraph:{block.text}")
+                    print(f"Paragraph:{block.text}")
                     resp += (block.text + "\n\n")
                 elif isinstance(block, Table):
                     resp += read_table(block) + "\n"
@@ -76,7 +84,7 @@ class RapidWordLoader(UnstructuredFileLoader):
         return listText
 
 if __name__ == "__main__":
-    loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")
+    loader = RapidWordLoader(file_path="/Users/wangvivi/Downloads/输变电设备风险评估导则.docx")
     #loader = Docx2txtLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")
     #loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/MySelf/AI/Test/这是一个测试文档_副本2.docx")
     docs = loader.load()
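Note on the mywordload.py hunks: the loader relies on the well-known python-docx recipe for walking a document body in order, yielding paragraphs and tables as they appear. A minimal, self-contained sketch of that traversal, assuming only public python-docx behavior (the sample path is a placeholder, not a file from this repo):

    from docx import Document
    from docx.document import Document as _Document
    from docx.oxml.text.paragraph import CT_P
    from docx.oxml.table import CT_Tbl
    from docx.table import _Cell, Table
    from docx.text.paragraph import Paragraph

    def iter_block_items(parent):
        # Yield Paragraph and Table objects in document order.
        if isinstance(parent, _Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("unsupported parent type")
        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    doc = Document("sample.docx")  # placeholder path
    for block in iter_block_items(doc):
        print(type(block).__name__)  # "Paragraph" or "Table"

Iterating the raw body XML is what lets tables be re-serialized inline between the paragraphs that surround them, which plain text extractors lose.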
diff --git a/server/agent/tools/search_knowledgebase_complex.py b/server/agent/tools/search_knowledgebase_complex.py
index af4d911..4664c70 100644
--- a/server/agent/tools/search_knowledgebase_complex.py
+++ b/server/agent/tools/search_knowledgebase_complex.py
@@ -11,7 +11,7 @@ from langchain.schema.language_model import BaseLanguageModel
 from typing import List, Any, Optional
 from langchain.prompts import PromptTemplate
 from server.chat.knowledge_base_chat import knowledge_base_chat
-from configs import VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, MAX_TOKENS
+from configs import FIRST_VECTOR_SEARCH_TOP_K, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, MAX_TOKENS
 import asyncio
 from server.agent import model_container
 from pydantic import BaseModel, Field
diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py
index 75dce1d..84f0a29 100644
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@@ -123,7 +123,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
                "UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
                "EverNoteLoader": ['.enex'],
                "UnstructuredFileLoader": ['.txt'],
-               "Docx2txtLoader":['.doc'],
+               "UnstructuredWordDocumentLoader":['.doc'],
                "RapidWordLoader":['.docx']
                }
 SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
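Note on the LOADER_DICT hunk: docx2txt only understands the zip-based .docx container, so routing legacy binary .doc files through Docx2txtLoader is the likely source of the load failure in the subject line; unstructured's UnstructuredWordDocumentLoader can handle .doc, while .docx keeps the custom RapidWordLoader. A minimal sketch of the extension-to-loader lookup this table drives; the helper name loader_name_for is illustrative, not a function in the repo:

    # Trimmed copy of the mapping, keeping only the entries touched above.
    LOADER_DICT = {
        "UnstructuredWordDocumentLoader": ['.doc'],
        "RapidWordLoader": ['.docx'],
    }

    def loader_name_for(ext: str) -> str:
        # Reverse lookup: which loader class name is registered for an extension?
        for loader_name, extensions in LOADER_DICT.items():
            if ext.lower() in extensions:
                return loader_name
        raise ValueError(f"unsupported extension: {ext}")

    assert loader_name_for(".doc") == "UnstructuredWordDocumentLoader"
    assert loader_name_for(".docx") == "RapidWordLoader"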
diff --git a/server/utils.py b/server/utils.py
index 5512ee6..e0aff28 100644
--- a/server/utils.py
+++ b/server/utils.py
@@ -9,12 +9,11 @@ from configs import (LLM_MODELS, LLM_DEVICE, EMBEDDING_DEVICE,
                      FSCHAT_MODEL_WORKERS, HTTPX_DEFAULT_TIMEOUT)
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from langchain.chat_models import ChatOpenAI
-from langchain.llms import OpenAI
+from langchain.llms import OpenAI, AzureOpenAI, Anthropic
 import httpx
 from typing import Literal, Optional, Callable, Generator, Dict, Any, Awaitable, Union, Tuple
 import logging
-import torch
 
 
 async def wrap_done(fn: Awaitable, event: asyncio.Event):
@@ -59,7 +58,6 @@ def get_ChatOpenAI(
     )
     return model
 
-
 def get_OpenAI(
     model_name: str,
     temperature: float,
@@ -153,6 +151,7 @@ class ChatMessage(BaseModel):
 
 def torch_gc():
     try:
+        import torch
         if torch.cuda.is_available():
             # with torch.cuda.device(DEVICE):
             torch.cuda.empty_cache()
@@ -500,58 +499,29 @@ def set_httpx_config(
 
 
 # Automatically detect the device available to torch; in distributed deployments, machines that do not run the LLM do not need torch installed
-def is_mps_available():
-    return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-
-
-def is_cuda_available():
-    return torch.cuda.is_available()
-
-
 def detect_device() -> Literal["cuda", "mps", "cpu"]:
     try:
+        import torch
         if torch.cuda.is_available():
             return "cuda"
-        if is_mps_available():
+        if torch.backends.mps.is_available():
             return "mps"
     except:
         pass
     return "cpu"
 
 
-def llm_device(device: str = None) -> Literal["cuda", "mps", "cpu", "xpu"]:
+def llm_device(device: str = None) -> Literal["cuda", "mps", "cpu"]:
     device = device or LLM_DEVICE
-    if device not in ["cuda", "mps", "cpu", "xpu"]:
-        logging.warning(f"device not in ['cuda', 'mps', 'cpu','xpu'], device = {device}")
+    if device not in ["cuda", "mps", "cpu"]:
         device = detect_device()
-    elif device == 'cuda' and not is_cuda_available() and is_mps_available():
-        logging.warning("cuda is not available, fallback to mps")
-        return "mps"
-    if device == 'mps' and not is_mps_available() and is_cuda_available():
-        logging.warning("mps is not available, fallback to cuda")
-        return "cuda"
-
-    # auto detect device if not specified
-    if device not in ["cuda", "mps", "cpu", "xpu"]:
-        return detect_device()
     return device
 
 
-def embedding_device(device: str = None) -> Literal["cuda", "mps", "cpu", "xpu"]:
-    device = device or LLM_DEVICE
+def embedding_device(device: str = None) -> Literal["cuda", "mps", "cpu"]:
+    device = device or EMBEDDING_DEVICE
     if device not in ["cuda", "mps", "cpu"]:
-        logging.warning(f"device not in ['cuda', 'mps', 'cpu','xpu'], device = {device}")
         device = detect_device()
-    elif device == 'cuda' and not is_cuda_available() and is_mps_available():
-        logging.warning("cuda is not available, fallback to mps")
-        return "mps"
-    if device == 'mps' and not is_mps_available() and is_cuda_available():
-        logging.warning("mps is not available, fallback to cuda")
-        return "cuda"
-
-    # auto detect device if not specified
-    if device not in ["cuda", "mps", "cpu"]:
-        return detect_device()
     return device
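Note on the server/utils.py hunks: torch becomes a lazily imported, optional dependency, so hosts that never run the LLM need not install it, and detect_device() degrades to "cpu". llm_device() and embedding_device() now normalize any unknown config value (such as the dropped "xpu") through auto-detection instead of the old pairwise cuda/mps fallbacks. A standalone sketch of the resulting behavior; the default parameter stands in for the LLM_DEVICE config value:

    from typing import Literal, Optional

    def detect_device() -> Literal["cuda", "mps", "cpu"]:
        try:
            import torch  # lazy import: optional on machines that don't run the LLM
            if torch.cuda.is_available():
                return "cuda"
            if torch.backends.mps.is_available():
                return "mps"
        except Exception:  # torch missing or backend probing failed
            pass
        return "cpu"

    def llm_device(device: Optional[str] = None,
                   default: str = "auto") -> Literal["cuda", "mps", "cpu"]:
        # "default" stands in for the LLM_DEVICE config value.
        device = device or default
        if device not in ("cuda", "mps", "cpu"):
            device = detect_device()  # unknown values (e.g. "xpu") fall back here
        return device

    print(llm_device("xpu"))  # no longer an error: resolves via detect_device()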