enhance log
This commit is contained in:
parent 6ed7002758
commit cc706ce7ef
@@ -3,7 +3,7 @@ import os
 import langchain
 import tempfile
 import shutil
-
+from logging.handlers import RotatingFileHandler
 
 # Whether to show verbose logs
 log_verbose = False
@@ -14,9 +14,12 @@ langchain.verbose = False
 # Log format
 LOG_FORMAT = "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
 logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-logging.basicConfig(format=LOG_FORMAT)
 
+LOG_BACKUP_COUNT = 10  # number of archived log files to keep
+LOG_MAX_FILE_SIZE = 1024 * 1024  # maximum size of each log file, in bytes
 
+# Create the logger and set the log level
+logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
 
 # Log storage path
 LOG_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
@@ -32,20 +35,14 @@ except Exception:
 os.makedirs(BASE_TEMP_DIR, exist_ok=True)
 
 
-#added by weiweiwang for log
-
-# Create the logger and set the log level
-logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
 
 # Create the file handler, setting its log level and file name
-#appLogPath = os.path.join(LOG_PATH, "app.log")
-file_handler = logging.FileHandler(LOG_PATH + '/app.log')
+file_handler = RotatingFileHandler(LOG_PATH + '/app.log', maxBytes=LOG_MAX_FILE_SIZE, backupCount=LOG_BACKUP_COUNT)
 file_handler.setLevel(logging.INFO)
 
 # Set the log record format
 formatter = logging.Formatter(LOG_FORMAT)
 file_handler.setFormatter(formatter)
 
 # Get the logger and attach the file handler
-appLogger = logging.getLogger(__name__)
-appLogger.addHandler(file_handler)
+logger.addHandler(file_handler)
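Note: with these settings the root logger rotates app.log at 1 MiB and keeps up to 10 archives (app.log.1 ... app.log.10, oldest dropped on rollover), capping log disk usage at roughly 11 MiB. A minimal standalone sketch of the configured behavior, with the file name and sizes taken from this commit (assumes the logs/ directory already exists; not part of the diff):

    import logging
    from logging.handlers import RotatingFileHandler

    LOG_FORMAT = "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)  # console handler

    handler = RotatingFileHandler("logs/app.log", maxBytes=1024 * 1024, backupCount=10)
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter(LOG_FORMAT))
    logging.getLogger().addHandler(handler)  # rollover renames app.log -> app.log.1

    logging.getLogger().info("goes to the console and to logs/app.log")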
@@ -1,6 +1,6 @@
 from typing import List
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
-from configs import PDF_OCR_THRESHOLD
+from configs import PDF_OCR_THRESHOLD, logger
 from document_loaders.ocr import get_ocr
 #PDF_OCR_THRESHOLD= (0.6,0.6)
 #from ocr import get_ocr
@@ -23,7 +23,7 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
 print(f"****page:{i+1}****")
 text = page.get_text("")
 text_lines = text.strip().split("\n")
-#print(f"Text content: {text_lines}")
+logger.debug(f"Text content: {text_lines}")
 
 img_list = page.get_image_info(xrefs=True)
 ocr_result = []
@@ -39,7 +39,7 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
 result, _ = ocr(img_array)
 if result:
     ocr_result = [line[1] for line in result]
-    #print(f"Image content: {ocr_result}")
+    logger.debug(f"Image content: {ocr_result}")
     #resp += "\n".join(ocr_result)
 
 if (len(ocr_result)>0):
@@ -49,7 +49,7 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
 # assume the page number is on the last line
 if text_lines[-1].isdigit():
     text = "\n".join(text_lines[:-1])
-    print(f"****** removed the page number")
+    logger.debug(f"****** removed the page number")
 resp += text + "\n"
 
 # update progress
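Note on visibility: the prints replaced above become logger.debug() calls, while the root logger in configs is set to INFO, so this per-page OCR output is silent by default. A minimal sketch of how it could be re-enabled for a debugging run (an assumption about usage, not part of the commit):

    import logging
    logging.getLogger().setLevel(logging.DEBUG)  # also emit the text/image debug lines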
@@ -1,5 +1,5 @@
 from fastapi import Body
-from configs import logger, log_verbose
+from configs import logger, log_verbose, logger
 from server.utils import BaseResponse
 from server.db.repository import feedback_message_to_db
@@ -11,7 +11,8 @@ from configs import (LLM_MODELS,
     RERANKER_MODEL,
     RERANKER_MAX_LENGTH,
     MODEL_PATH,
-    DOWNLOAD_BASE_URL)
+    DOWNLOAD_BASE_URL,
+    logger)
 from server.utils import wrap_done, get_ChatOpenAI
 from server.utils import BaseResponse, get_prompt_template
 from langchain.chains import LLMChain
@@ -26,6 +27,8 @@ from urllib.parse import urlencode
 from server.knowledge_base.kb_doc_api import search_docs
 from server.reranker.reranker import LangchainReranker
 from server.utils import embedding_device
+import time
 
 async def knowledge_base_chat(query: str = Body(..., description="User input", examples=["你好"]),
     knowledge_base_name: str = Body(..., description="Knowledge base name", examples=["samples"]),
     top_k: int = Body(VECTOR_SEARCH_TOP_K, description="Number of matched vectors"),
@@ -81,6 +84,7 @@ async def knowledge_base_chat(query: str = Body(..., description="User input",
 max_tokens=max_tokens,
 callbacks=[callback],
 )
+start_time = time.time()  # record the start time
 docs = search_docs(query, knowledge_base_name, top_k, score_threshold)
 # docs = await run_in_threadpool(search_docs,
 #                                query=query,
@@ -88,7 +92,14 @@ async def knowledge_base_chat(query: str = Body(..., description="User input",
 #                                top_k=top_k,
 #                                score_threshold=score_threshold)
 
+end_time = time.time()  # record the end time
+execution_time = end_time - start_time  # compute the elapsed time
+logger.info(f"search_docs took {execution_time} seconds")
 
 # apply the reranker
+logger.info(f"use_reranker:{USE_RERANKER}")
 
+start_time = time.time()  # record the start time
 if USE_RERANKER:
     reranker_model_path = MODEL_PATH["reranker"].get(RERANKER_MODEL,"BAAI/bge-reranker-large")
     print("-----------------model path------------------")
@@ -102,6 +113,9 @@ async def knowledge_base_chat(query: str = Body(..., description="User input",
 query=query)
 print("---------after rerank------------------")
 print(docs)
+end_time = time.time()  # record the end time
+execution_time = end_time - start_time  # compute the elapsed time
+logger.info(f"reranker took {execution_time} seconds")
 
 context = "\n".join([doc.page_content for doc in docs])
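Note: the start_time/end_time/execution_time blocks added above repeat the same pattern for each timed stage. A hedged refactor sketch (not in the commit) that wraps the pattern in a context manager so each stage is one line:

    import time
    from contextlib import contextmanager

    @contextmanager
    def log_elapsed(logger, stage: str):
        start = time.time()  # record the start time
        try:
            yield
        finally:
            logger.info(f"{stage} took {time.time() - start:.3f} seconds")

    # usage:
    # with log_elapsed(logger, "search_docs"):
    #     docs = search_docs(query, knowledge_base_name, top_k, score_threshold)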
@@ -19,7 +19,7 @@ from server.knowledge_base.model.kb_document_model import DocumentWithVSId
 from typing import List, Dict
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from configs import USE_RANKING, appLogger
+from configs import USE_RANKING, logger
 import jieba
 from typing import List, Dict, Tuple
@@ -39,7 +39,7 @@ def search_docs(
 data = []
 if kb is not None:
     if query:
-        print(f"search_docs, query:{query}")
+        logger.info(f"search_docs, query:{query},top_k:{top_k},score_threshold:{score_threshold}, use_ranking:{USE_RANKING}")
         docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
         #print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
@@ -355,7 +355,7 @@ def update_docs(
 failed_files = {}
 kb_files = []
 
-appLogger.info(f"111111 kb_doc_api update_docs file_names:{file_names}, number of updated docs: {len(docs)}")
+logger.info(f"111111 kb_doc_api update_docs file_names:{file_names}, number of updated docs: {len(docs)}")
 # build the list of files whose docs need to be loaded
 for file_name in file_names:
     file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name)
@@ -364,7 +364,7 @@ def update_docs(
 continue
 if file_name not in docs:
     try:
-        appLogger.info(f"****kb_doc_api update_docs file_name not in docs,filename:{file_name}")
+        logger.info(f"****kb_doc_api update_docs file_name not in docs,filename:{file_name}")
         kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name))
     except Exception as e:
         msg = f"Error loading document {file_name}: {e}"
@@ -391,7 +391,7 @@ def update_docs(
 # vectorize the custom docs
 for file_name, v in docs.items():
     try:
-        appLogger.info(f"222222 kb_doc_api update_docs file_name:{file_name}, number of updated docs: {len(docs)}")
+        logger.info(f"222222 kb_doc_api update_docs file_name:{file_name}, number of updated docs: {len(docs)}")
         v = [x if isinstance(x, Document) else Document(**x) for x in v]
         kb_file = KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name)
         kb.update_doc(kb_file, docs=v, not_refresh_vs_cache=True)
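Note: replacing appLogger with logger works because configs attaches the rotating file handler to the root logger, and records from any module logger propagate up to root handlers. A minimal sketch of that propagation (an assumption about the intent, not part of the diff):

    import logging
    log = logging.getLogger(__name__)  # no handler of its own
    log.info("still reaches app.log via propagation to the root logger")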
@@ -28,7 +28,7 @@ from typing import List, Union, Dict, Optional, Tuple
 
 from server.embeddings_api import embed_texts, aembed_texts, embed_documents
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
-from configs import logger, appLogger
+from configs import logger
 import time
@@ -115,15 +115,15 @@ class KBService(ABC):
 custom_docs = True
 for doc in docs:
     doc.metadata.setdefault("source", kb_file.filename)
-appLogger.info(f"kb_doc_api add_doc docs is not empty, len(docs):{len(docs)}, file name:{kb_file.filename}")
+logger.info(f"kb_doc_api add_doc docs is not empty, len(docs):{len(docs)}, file name:{kb_file.filename}")
 else:
     docs = kb_file.file2text()
     custom_docs = False
-    appLogger.info(f"kb_doc_api add_doc docs is empty, len(docs):{len(docs)}, file name:{kb_file.filename}")
+    logger.info(f"kb_doc_api add_doc docs is empty, len(docs):{len(docs)}, file name:{kb_file.filename}")
 
 end_time = time.time()  # record the end time
 execution_time = end_time - start_time  # compute the elapsed time
-appLogger.info(f"add_doc: loading the file or splitting it took {execution_time} seconds")
+logger.info(f"add_doc: loading the file or splitting it took {execution_time} seconds")
 
 start_time = time.time()  # record the start time
 if docs:
@@ -138,11 +138,11 @@ class KBService(ABC):
 rel_path = Path(source).relative_to(self.doc_path)
 doc.metadata["source"] = str(rel_path.as_posix().strip("/"))
 except Exception as e:
-    appLogger.info(f"cannot convert absolute path ({source}) to relative path. error is : {e}")
+    logger.info(f"cannot convert absolute path ({source}) to relative path. error is : {e}")
 self.delete_doc(kb_file)
-#appLogger.info(f"add_doc filepath:{kb_file.filepath}, about to run do_add_doc")
+#logger.info(f"add_doc filepath:{kb_file.filepath}, about to run do_add_doc")
 doc_infos = self.do_add_doc(docs, **kwargs)
-#appLogger.info(f"add_doc filepath:{kb_file.filepath}, about to run add_file_to_db")
+#logger.info(f"add_doc filepath:{kb_file.filepath}, about to run add_file_to_db")
 status = add_file_to_db(kb_file,
     custom_docs=custom_docs,
     docs_count=len(docs),
@@ -150,7 +150,7 @@ class KBService(ABC):
 
 end_time = time.time()  # record the end time
 execution_time = end_time - start_time  # compute the elapsed time
-appLogger.info(f"add_doc: ingestion took {execution_time} seconds")
+logger.info(f"add_doc: ingestion took {execution_time} seconds")
 else:
     status = False
 return status
@@ -251,7 +251,7 @@ class KBService(ABC):
 Retrieve Documents by file_name or metadata
 '''
 doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata)
-#appLogger.info(f"kb_doc_api list_docs_from_db: {doc_infos}")
+#logger.info(f"kb_doc_api list_docs_from_db: {doc_infos}")
 docs = []
 for x in doc_infos:
     doc_info = self.get_doc_by_ids([x["id"]])
@@ -9,7 +9,7 @@ from server.knowledge_base.kb_service.base import KBService, SupportedVSType
 from server.knowledge_base.utils import KnowledgeFile
 from server.utils import load_local_embeddings
 from elasticsearch import Elasticsearch, BadRequestError
-from configs import logger, appLogger
+from configs import logger
 from configs import kbs_config
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
@@ -30,13 +30,13 @@ class ESKBService(KBService):
 self.es_client_python = Elasticsearch(f"http://{self.IP}:{self.PORT}",
     basic_auth=(self.user,self.password))
 else:
-    appLogger.warning("ES username and password are not configured")
+    logger.warning("ES username and password are not configured")
     self.es_client_python = Elasticsearch(f"http://{self.IP}:{self.PORT}")
 except ConnectionError:
-    appLogger.error("Failed to connect to Elasticsearch!")
+    logger.error("Failed to connect to Elasticsearch!")
     raise ConnectionError
 except Exception as e:
-    appLogger.error(f"Error occurred: {e}")
+    logger.error(f"Error occurred: {e}")
     raise e
 try:
     # first try to create the index via es_client_python
@@ -51,8 +51,8 @@ class ESKBService(KBService):
 }
 self.es_client_python.indices.create(index=self.index_name, mappings=mappings)
 except BadRequestError as e:
-    appLogger.error("Failed to create the index, retrying")
-    appLogger.error(e)
+    logger.error("Failed to create the index, retrying")
+    logger.error(e)
 
 try:
     # langchain ES connection and index creation
@@ -67,7 +67,7 @@ class ESKBService(KBService):
 es_password=self.password
 )
 else:
-    appLogger.warning("ES username and password are not configured")
+    logger.warning("ES username and password are not configured")
     self.db_init = ElasticsearchStore(
         es_url=f"http://{self.IP}:{self.PORT}",
         index_name=self.index_name,
@@ -77,10 +77,10 @@ class ESKBService(KBService):
 )
 except ConnectionError:
     print("### Failed to initialize Elasticsearch!")
-    appLogger.error("### Failed to initialize Elasticsearch!")
+    logger.error("### Failed to initialize Elasticsearch!")
     raise ConnectionError
 except Exception as e:
-    appLogger.error(f"Error occurred: {e}")
+    logger.error(f"Error occurred: {e}")
     raise e
 try:
     # try to create the index via db_init
@@ -89,8 +89,8 @@ class ESKBService(KBService):
 dims_length=self.dims_length
 )
 except Exception as e:
-    appLogger.error("Failed to create the index...")
-    appLogger.error(e)
+    logger.error("Failed to create the index...")
+    logger.error(e)
     # raise e
@@ -156,23 +156,22 @@ class ESKBService(KBService):
 except ConnectionError as ce:
     print(ce)
     print("Failed to connect to Elasticsearch!")
-    appLogger.error("Failed to connect to Elasticsearch!")
+    logger.error("Failed to connect to Elasticsearch!")
 except Exception as e:
-    appLogger.error(f"Error occurred: {e}")
+    logger.error(f"Error occurred: {e}")
     print(e)
 
 
 def do_search(self, query:str, top_k: int, score_threshold: float):
     # text similarity search
-    print(f"do_search,top_k:{top_k},score_threshold:{score_threshold}")
     docs = self.db_init.similarity_search_with_score(query=query,
         k=top_k)
     return docs
 
 def searchbyContent(self, query:str, top_k: int = 2):
     if self.es_client_python.indices.exists(index=self.index_name):
-        appLogger.info(f"******ESKBService searchByContent {self.index_name},query:{query}")
+        logger.info(f"******ESKBService searchByContent {self.index_name},query:{query}")
         tem_query = {
             "query": {"match": {
                 "context": "*" + query + "*"
@@ -199,7 +198,7 @@ class ESKBService(KBService):
 
 def searchbyContentInternal(self, query:str, top_k: int = 2):
     if self.es_client_python.indices.exists(index=self.index_name):
-        appLogger.info(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
+        logger.info(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
         tem_query = {
             "query": {"match": {
                 "context": "*" + query + "*"
@@ -231,19 +230,19 @@ class ESKBService(KBService):
 metadata=result["_source"]["metadata"],
 ))
 except Exception as e:
-    appLogger.error(f"ES Docs Get Error! {e}")
+    logger.error(f"ES Docs Get Error! {e}")
 return result_list
 
 
 def del_doc_by_ids(self, ids: List[str]) -> bool:
-    appLogger.info(f"es_kb_service del_doc_by_ids")
+    logger.info(f"es_kb_service del_doc_by_ids")
     for doc_id in ids:
         try:
             self.es_client_python.delete(index=self.index_name,
                 id=doc_id,
                 refresh=True)
         except Exception as e:
-            appLogger.error(f"ES Docs Delete Error! {e}")
+            logger.error(f"ES Docs Delete Error! {e}")
 
 
 def do_delete_doc(self, kb_file, **kwargs):
@@ -272,7 +271,7 @@ class ESKBService(KBService):
 id=doc_id,
 refresh=True)
 except Exception as e:
-    appLogger.error(f"ES Docs Delete Error! {e}")
+    logger.error(f"ES Docs Delete Error! {e}")
 
 # self.db_init.delete(ids=delete_list)
 #self.es_client_python.indices.refresh(index=self.index_name)
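Note: the logger.error(f"... {e}") calls above record only the exception message. A hedged sketch of an alternative that also preserves the traceback, mirroring the exc_info pattern this commit keeps elsewhere (risky_es_call is a hypothetical placeholder, not part of the diff):

    try:
        risky_es_call()  # hypothetical placeholder for an Elasticsearch operation
    except Exception as e:
        logger.error(f"ES Docs Delete Error! {e}", exc_info=True)  # traceback lands in app.log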
@@ -1,7 +1,7 @@
 from fastapi import Body
 from configs import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
     OVERLAP_SIZE,
-    logger, log_verbose, )
+    logger, log_verbose )
 from server.knowledge_base.utils import (list_files_from_folder)
 from sse_starlette import EventSourceResponse
 import json
@@ -10,7 +10,7 @@ from typing import List, Optional
 from server.knowledge_base.kb_summary.base import KBSummaryService
 from server.knowledge_base.kb_summary.summary_chunk import SummaryAdapter
 from server.utils import wrap_done, get_ChatOpenAI, BaseResponse
-from configs import LLM_MODELS, TEMPERATURE
+from configs import LLM_MODELS, TEMPERATURE, logger
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
 
 def recreate_summary_vector_store(
|
|
@ -65,7 +65,7 @@ def import_from_db(
|
||||||
con.close()
|
con.close()
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"无法读取备份数据库:{sqlite_path}。错误信息:{e}")
|
logger.error(f"无法读取备份数据库:{sqlite_path}。错误信息:{e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,6 @@ from configs import (
|
||||||
OVERLAP_SIZE,
|
OVERLAP_SIZE,
|
||||||
ZH_TITLE_ENHANCE,
|
ZH_TITLE_ENHANCE,
|
||||||
logger,
|
logger,
|
||||||
appLogger,
|
|
||||||
log_verbose,
|
log_verbose,
|
||||||
text_splitter_dict,
|
text_splitter_dict,
|
||||||
LLM_MODELS,
|
LLM_MODELS,
|
||||||
|
|
@@ -95,7 +94,7 @@ def list_files_from_folder(kb_name: str):
 process_entry(entry)
 
 except Exception as e:
-    appLogger.error(f"Error occurred: {e}")
+    logger.error(f"Error occurred: {e}")
 
 return result
@@ -176,7 +175,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None):
 DocumentLoader = getattr(document_loaders_module, loader_name)
 except Exception as e:
     msg = f"Error finding loader {loader_name} for file {file_path}: {e}"
-    appLogger.error(f'{e.__class__.__name__}: {msg}',
+    logger.error(f'{e.__class__.__name__}: {msg}',
         exc_info=e if log_verbose else None)
     document_loaders_module = importlib.import_module('langchain.document_loaders')
     DocumentLoader = getattr(document_loaders_module, "UnstructuredFileLoader")
@@ -315,14 +314,15 @@ class KnowledgeFile:
 
 def file2docs(self, refresh: bool = False):
     if self.docs is None or refresh:
-        appLogger.info(f"{self.document_loader_name} used for {self.filepath}")
+        logger.info(f"{self.document_loader_name} used for {self.filepath}")
         loader = get_loader(loader_name=self.document_loader_name,
             file_path=self.filepath,
             loader_kwargs=self.loader_kwargs)
         self.docs = loader.load()
+        logger.info(f"{self.filepath} finished loading")
     return self.docs
 
-print(f"KnowledgeFile: filepath:{self.filepath}, doc_title_name:{self.doc_title_name}, ext:{self.ext}")
+#print(f"KnowledgeFile: filepath:{self.filepath}, doc_title_name:{self.doc_title_name}, ext:{self.ext}")
 
 def docs2texts(
     self,
@@ -347,7 +347,7 @@ class KnowledgeFile:
 if doc.page_content.strip()!="":
     doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
 file_name_without_extension, file_extension = os.path.splitext(self.filepath)
-print(f"filepath:{self.filepath}, file name after splitting: {file_name_without_extension},{file_extension}")
+logger.info(f"filepath:{self.filepath}, file name after splitting: {file_name_without_extension},{file_extension}")
 if not docs:
     return []
 if self.ext not in [".csv"]:
@@ -437,10 +437,11 @@ def files2docs_in_thread(
 
 def file2docs(*, file: KnowledgeFile, **kwargs) -> Tuple[bool, Tuple[str, str, List[Document]]]:
     try:
+        logger.info(f"file2docs: loading from file {file.kb_name}/{file.filename}")
         return True, (file.kb_name, file.filename, file.file2text(**kwargs))
     except Exception as e:
-        msg = f"Error loading documents from file {file.kb_name}/{file.filename}: {e}"
-        appLogger.error(f'{e.__class__.__name__}: {msg}',
+        msg = f"file2docs: error loading documents from file {file.kb_name}/{file.filename}: {e}"
+        logger.error(f'{e.__class__.__name__}: {msg}',
            exc_info=e if log_verbose else None)
        return False, (file.kb_name, file.filename, msg)
@@ -1,5 +1,5 @@
 from fastapi import Body
-from configs import logger, log_verbose, LLM_MODELS, HTTPX_DEFAULT_TIMEOUT
+from configs import logger, log_verbose, LLM_MODELS, HTTPX_DEFAULT_TIMEOUT, logger
 from server.utils import (BaseResponse, fschat_controller_address, list_config_llm_models,
     get_httpx_client, get_model_worker_config)
 from typing import List
34 startup.py
|
|
@ -566,36 +566,36 @@ def dump_server_info(after_start=False, args=None):
|
||||||
import fastchat
|
import fastchat
|
||||||
from server.utils import api_address, webui_address
|
from server.utils import api_address, webui_address
|
||||||
|
|
||||||
print("\n")
|
logger.info("\n")
|
||||||
print("=" * 30 + "Langchain-Chatchat Configuration" + "=" * 30)
|
logger.info("=" * 30 + "Langchain-Chatchat Configuration" + "=" * 30)
|
||||||
print(f"操作系统:{platform.platform()}.")
|
logger.info(f"操作系统:{platform.platform()}")
|
||||||
print(f"python版本:{sys.version}")
|
logger.info(f"python版本:{sys.version}")
|
||||||
print(f"项目版本:{VERSION}")
|
logger.info(f"项目版本:{VERSION}")
|
||||||
print(f"langchain版本:{langchain.__version__}. fastchat版本:{fastchat.__version__}")
|
logger.info(f"langchain版本:{langchain.__version__}. fastchat版本:{fastchat.__version__}")
|
||||||
print("\n")
|
logger.info("\n")
|
||||||
|
|
||||||
models = LLM_MODELS
|
models = LLM_MODELS
|
||||||
if args and args.model_name:
|
if args and args.model_name:
|
||||||
models = args.model_name
|
models = args.model_name
|
||||||
|
|
||||||
print(f"当前使用的分词器:{TEXT_SPLITTER_NAME}")
|
logger.info(f"当前使用的分词器:{TEXT_SPLITTER_NAME}")
|
||||||
print(f"当前启动的LLM模型:{models} @ {llm_device()}")
|
logger.info(f"当前启动的LLM模型:{models} @ {llm_device()}")
|
||||||
|
|
||||||
for model in models:
|
for model in models:
|
||||||
pprint(get_model_worker_config(model))
|
pprint(get_model_worker_config(model))
|
||||||
print(f"当前Embbedings模型: {EMBEDDING_MODEL} @ {embedding_device()}")
|
logger.info(f"当前Embbedings模型: {EMBEDDING_MODEL} @ {embedding_device()}")
|
||||||
|
|
||||||
if after_start:
|
if after_start:
|
||||||
print("\n")
|
logger.info("\n")
|
||||||
print(f"服务端运行信息:")
|
logger.info(f"服务端运行信息:")
|
||||||
if args.openai_api:
|
if args.openai_api:
|
||||||
print(f" OpenAI API Server: {fschat_openai_api_address()}")
|
logger.info(f" OpenAI API Server: {fschat_openai_api_address()}")
|
||||||
if args.api:
|
if args.api:
|
||||||
print(f" Chatchat API Server: {api_address()}")
|
logger.info(f" Chatchat API Server: {api_address()}")
|
||||||
if args.webui:
|
if args.webui:
|
||||||
print(f" Chatchat WEBUI Server: {webui_address()}")
|
logger.info(f" Chatchat WEBUI Server: {webui_address()}")
|
||||||
print("=" * 30 + "Langchain-Chatchat Configuration" + "=" * 30)
|
logger.info("=" * 30 + "Langchain-Chatchat Configuration" + "=" * 30)
|
||||||
print("\n")
|
logger.info("\n")
|
||||||
|
|
||||||
|
|
||||||
async def start_main_server():
|
async def start_main_server():
|
||||||
|
|
|
||||||
|
|
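Note: because configs calls logging.basicConfig(), the root logger carries both a console StreamHandler and the rotating file handler, so the startup banner moved to logger.info() above still reaches the terminal as well as app.log; each former print("\n") now becomes a timestamped log record rather than a bare blank line. A minimal sketch inspecting the resulting handler set (an illustration, not part of the diff):

    import logging
    for h in logging.getLogger().handlers:
        print(type(h).__name__)  # StreamHandler, RotatingFileHandler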
@@ -1,5 +1,6 @@
 from langchain.docstore.document import Document
 import re
+from configs import logger
 
 def get_fist_level_title(
     text: str,
@@ -181,11 +182,11 @@ def zh_second_title_enhance(docs: Document) -> Document:
 title = None
 if len(docs) > 0:
     for doc in docs:
-        #print(f"zh_second_title_enhance: {doc}")
+        logger.debug(f"zh_second_title_enhance: {doc}")
         second_title = get_second_level_title(doc.page_content)
         if second_title:
             title = second_title
-            #print(f"title: {title}")
+            logger.debug(f"title: {title}")
         elif title:
             #print(f"title is not none")
             temp_third_content = is_third_level_content(doc.page_content)
@@ -194,7 +195,7 @@ def zh_second_title_enhance(docs: Document) -> Document:
 doc.page_content = f"{title} {doc.page_content}"
 else:
     title = None
-print(f"final title: {title}")
+logger.debug(f"final title: {title}")
 return docs
 else:
     print("zh_second_title_enhance: file does not exist")
@@ -204,19 +205,19 @@ def zh_first_title_enhance(docs: Document) -> Document:
 title = None
 if len(docs) > 0:
     for doc in docs:
-        #print(f"zh_first_title_enhance: {doc}")
+        logger.debug(f"zh_first_title_enhance: {doc}")
         first_title = get_fist_level_title(doc.page_content)
         if first_title:
             title = first_title
-            #print(f"title: {title}")
+            logger.debug(f"title: {title}")
         elif title:
             temp_second_content = is_second_level_content(doc.page_content)
             if temp_second_content:
-                #print(f"is_second_level_content : {temp_second_content}")
+                logger.debug(f"is_second_level_content : {temp_second_content}")
                 doc.page_content = f"{title} {doc.page_content}"
             else:
                 title = None
-#print(f"final title: {title}")
+logger.debug(f"final title: {title}")
 return docs
 else:
     print("zh_first_title_enhance: file does not exist")