diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py
index 0341edd..080593e 100644
--- a/server/knowledge_base/kb_doc_api.py
+++ b/server/knowledge_base/kb_doc_api.py
@@ -19,7 +19,7 @@ from server.knowledge_base.model.kb_document_model import DocumentWithVSId
 from typing import List, Dict
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from configs import USE_RANKING
+from configs import USE_RANKING, appLogger
 import jieba
 from typing import List, Dict,Tuple
@@ -41,12 +41,12 @@ def search_docs(
     if query:
         print(f"search_docs, query:{query}")
         docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
-        print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
+        #print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
         docs_key = kb.search_content_internal(query,2)
-        print(f"search_content_internal, len of docs {len(docs_key)}, docs:{docs_key}")
+        #print(f"search_content_internal, len of docs {len(docs_key)}, docs:{docs_key}")
         docs = merge_and_deduplicate(docs, docs_key)
-        print(f"after merge_and_deduplicate, len of docs: {len(docs)}, docs:{docs}")
+        #print(f"after merge_and_deduplicate, len of docs: {len(docs)}, docs:{docs}")
         if USE_RANKING:
             queryList = []
             queryList.append(query)
@@ -57,16 +57,16 @@
             vectorizer = TfidfVectorizer()
             tfidf_matrix = vectorizer.fit_transform(doc_contents)
-            print(f"****** search_docs, tfidf_matrix:{tfidf_matrix}")
+            #print(f"****** search_docs, tfidf_matrix:{tfidf_matrix}")
             query_vector = vectorizer.transform(queryList)
-            print(f"****** search_docs, query_vector:{query_vector}")
+            #print(f"****** search_docs, query_vector:{query_vector}")
             cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
-            print(f"****** search_docs, cosine_similarities:{cosine_similarities}")
+            #print(f"****** search_docs, cosine_similarities:{cosine_similarities}")
             # Combine the similarity scores with the documents
             docs_with_scores = [(doc, score) for doc, score in zip(docs, cosine_similarities)]
             sorted_docs = sorted(docs_with_scores, key=lambda x: x[1], reverse=True)
-            print(f"****** search_docs, sorted_docs:{sorted_docs}")
+            #print(f"****** search_docs, sorted_docs:{sorted_docs}")
             i = 0
             for doc in sorted_docs:
                 if i>=top_k:
@@ -74,7 +74,7 @@
                 else:
                     data.append(DocumentWithVSId(page_content = doc[0][0].page_content,id=doc[0][0].metadata.get("id"), score=doc[0][1],metadata=doc[0][0].metadata))
                     i = i+1
-            print(f"****** search_docs top K , sorted_docs:{data}")
+            #print(f"****** search_docs top K , sorted_docs:{data}")
         else:
             data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id"))
                     for x in docs]
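When USE_RANKING is enabled, the hunks above re-rank the merged vector and keyword hits by TF-IDF cosine similarity. Below is a minimal, self-contained sketch of that re-ranking step over plain strings (`rerank_by_tfidf` is an illustrative name, not part of this PR); the real code additionally carries (Document, score) tuples, and jieba is presumably imported to tokenize Chinese text before vectorizing.

```python
# Minimal sketch of the TF-IDF re-ranking used in search_docs above;
# it ranks plain strings rather than the project's (Document, score) tuples.
from typing import List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rerank_by_tfidf(query: str, doc_contents: List[str], top_k: int = 3) -> List[Tuple[str, float]]:
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(doc_contents)  # one row per candidate document
    query_vector = vectorizer.transform([query])           # project the query into the same space
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ranked = sorted(zip(doc_contents, scores), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]
```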
@@ -355,7 +355,7 @@ def update_docs(
     failed_files = {}
     kb_files = []

-    print(f"111111 kb_doc_api update_docs file_name:{file_names},number of docs to update:{len(docs)}")
+    appLogger.info(f"111111 kb_doc_api update_docs file_names:{file_names},number of docs to update:{len(docs)}")
     # Build the list of files whose docs need to be loaded
     for file_name in file_names:
         file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name)
@@ -363,40 +363,35 @@ def update_docs(
         if file_detail.get("custom_docs") and not override_custom_docs:
             continue
         if file_name not in docs:
-            print(f"****kb_doc_api update_docs file_name not in docs")
             try:
+                appLogger.info(f"****kb_doc_api update_docs file_name not in docs,filename:{file_name}")
                 kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name))
-
-                # Generate docs from the files and vectorize them.
-                # This relies on KnowledgeFile's caching: Documents are loaded in multiple threads, then handed to KnowledgeFile
-                for status, result in files2docs_in_thread(kb_files,
-                                                           chunk_size=chunk_size,
-                                                           chunk_overlap=chunk_overlap,
-                                                           zh_title_enhance=zh_title_enhance):
-                    if status:
-                        print(f"kb_doc_api update_docs generating docs from file and vectorizing,filename:{file_name}")
-                        kb_name, file_name, new_docs = result
-                        kb_file = KnowledgeFile(filename=file_name,
-                                                knowledge_base_name=knowledge_base_name)
-                        kb_file.splited_docs = new_docs
-                        kb.update_doc(kb_file, not_refresh_vs_cache=True)
-                    else:
-                        kb_name, file_name, error = result
-                        failed_files[file_name] = error
-
             except Exception as e:
                 msg = f"Error loading document {file_name}: {e}"
                 logger.error(f'{e.__class__.__name__}: {msg}',
                              exc_info=e if log_verbose else None)
                 failed_files[file_name] = msg
+
+    # Generate docs from the files and vectorize them.
+    # This relies on KnowledgeFile's caching: Documents are loaded in multiple threads, then handed to KnowledgeFile
+    for status, result in files2docs_in_thread(kb_files,
+                                               chunk_size=chunk_size,
+                                               chunk_overlap=chunk_overlap,
+                                               zh_title_enhance=zh_title_enhance):
+        if status:
+            kb_name, file_name, new_docs = result
+            kb_file = KnowledgeFile(filename=file_name,
+                                    knowledge_base_name=knowledge_base_name)
+            kb_file.splited_docs = new_docs
+            kb.update_doc(kb_file, not_refresh_vs_cache=True)
         else:
-            print(f"****kb_doc_api update_docs file_name in docs")
+            kb_name, file_name, error = result
+            failed_files[file_name] = error

     # Vectorize the custom docs
     for file_name, v in docs.items():
-        print(f"222222 kb_doc_api update_docs file_name:{file_name},number of docs to update:{len(docs)}")
         try:
-            print(f"kb_doc_api update_docs vectorizing custom docs,filename:{file_name}")
+            appLogger.info(f"222222 kb_doc_api update_docs file_name:{file_name},number of docs to update:{len(docs)}")
             v = [x if isinstance(x, Document) else Document(**x) for x in v]
             kb_file = KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name)
             kb.update_doc(kb_file, docs=v, not_refresh_vs_cache=True)
@@ -409,10 +404,8 @@ def update_docs(
     if not not_refresh_vs_cache:
         kb.save_vector_store()

-    print(f"kb_doc_api update_docs before finishing, failed_files:{failed_files}")
     return BaseResponse(code=200, msg=f"Document update complete", data={"failed_files": failed_files})

-
 def download_doc(
         knowledge_base_name: str = Query(..., description="Knowledge base name", examples=["samples"]),
         file_name: str = Query(..., description="File name", examples=["test.txt"]),
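Throughout the PR, ad-hoc print calls are either commented out or routed to an appLogger imported from configs. Its definition is not shown in this diff; a plausible minimal setup, assuming it is a standard logging.Logger for application-level events, might look like:

```python
# Hypothetical sketch of configs.appLogger -- the real definition lives in
# the configs package and may differ (handler, format, destination).
import logging

appLogger = logging.getLogger("app")
appLogger.setLevel(logging.INFO)
_handler = logging.StreamHandler()  # or a FileHandler pointed at the app log
_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
appLogger.addHandler(_handler)
```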
diff --git a/server/knowledge_base/kb_service/base.py b/server/knowledge_base/kb_service/base.py
index 980422f..31ce3fe 100644
--- a/server/knowledge_base/kb_service/base.py
+++ b/server/knowledge_base/kb_service/base.py
@@ -28,6 +28,8 @@ from typing import List, Union, Dict, Optional, Tuple

 from server.embeddings_api import embed_texts, aembed_texts, embed_documents
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
+from configs import logger,appLogger
+import time


 def normalize(embeddings: List[List[float]]) -> np.ndarray:
@@ -108,16 +110,22 @@ class KBService(ABC):
         Add a file to the knowledge base.
         If docs is provided, the text is not vectorized again and the corresponding database entry is marked custom_docs=True
         """
+        start_time = time.time()  # record the start time
         if docs:
             custom_docs = True
             for doc in docs:
                 doc.metadata.setdefault("source", kb_file.filename)
-            print(f"kb_doc_api add_doc docs not empty,len(docs):{len(docs)}")
+            appLogger.info(f"kb_doc_api add_doc docs not empty,len(docs):{len(docs)},filename:{kb_file.filename}")
         else:
             docs = kb_file.file2text()
             custom_docs = False
-            print(f"kb_doc_api add_doc docs empty,len(docs):{len(docs)}")
+            appLogger.info(f"kb_doc_api add_doc docs empty,len(docs):{len(docs)},filename:{kb_file.filename}")
+        end_time = time.time()  # record the end time
+        execution_time = end_time - start_time  # compute the elapsed time
+        appLogger.info(f"add_doc: loading or splitting the file took {execution_time} seconds")
+
+        start_time = time.time()  # record the start time
         if docs:
             # Rewrite metadata["source"] as a relative path
             for doc in docs:
@@ -130,15 +138,19 @@
                     rel_path = Path(source).relative_to(self.doc_path)
                     doc.metadata["source"] = str(rel_path.as_posix().strip("/"))
                 except Exception as e:
-                    print(f"cannot convert absolute path ({source}) to relative path. error is : {e}")
+                    appLogger.info(f"cannot convert absolute path ({source}) to relative path. error is : {e}")
             self.delete_doc(kb_file)
-            print(f"add_doc filepath:{kb_file.filepath},about to run do_add_doc")
+            #appLogger.info(f"add_doc filepath:{kb_file.filepath},about to run do_add_doc")
             doc_infos = self.do_add_doc(docs, **kwargs)
-            print(f"add_doc filepath:{kb_file.filepath} about to run add_file_to_db")
+            #appLogger.info(f"add_doc filepath:{kb_file.filepath} about to run add_file_to_db")
             status = add_file_to_db(kb_file,
                                     custom_docs=custom_docs,
                                     docs_count=len(docs),
                                     doc_infos=doc_infos)
+
+            end_time = time.time()  # record the end time
+            execution_time = end_time - start_time  # compute the elapsed time
+            appLogger.info(f"add_doc: writing to the store took: {execution_time} seconds")
         else:
             status = False
         return status
@@ -214,7 +226,7 @@ class KBService(ABC):
     def del_doc_by_ids_from_db(self, knowledge_base_name: str , file_name:str, ids: List[str]) -> bool:
         delete_docs_from_db_by_ids(ids)
         update_file_to_db(knowledge_base_name = knowledge_base_name,file_name = file_name)
-        print(f"*******KBService del_doc_by_ids_from_db")
+        #print(f"*******KBService del_doc_by_ids_from_db")
         return True
@@ -239,7 +251,7 @@ class KBService(ABC):
         Retrieve Documents by file_name or metadata
         '''
         doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata)
-        print(f"kb_doc_api list_docs_from_db: {doc_infos}")
+        #appLogger.info(f"kb_doc_api list_docs_from_db: {doc_infos}")
         docs = []
         for x in doc_infos:
             doc_info = self.get_doc_by_ids([x["id"]])
@@ -257,7 +269,7 @@ class KBService(ABC):
             else:
                 # Handle the case where doc_info is None or not a list
                 # e.g. skip the current iteration or take some other action
-                print("base.py list_docs returned empty")
+                #print("base.py list_docs returned empty")
                 pass
         return docs
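add_doc now brackets its two phases (load/split, then vectorize and persist) with start/end time.time() pairs. A hedged refactoring sketch, a hypothetical helper rather than anything in this PR, would centralize that pattern in a context manager:

```python
# Hypothetical helper distilling the timing pattern added to add_doc;
# not part of this PR.
import logging
import time
from contextlib import contextmanager

appLogger = logging.getLogger("app")  # stand-in for configs.appLogger

@contextmanager
def log_elapsed(label: str):
    start = time.time()  # record the start time
    try:
        yield
    finally:
        appLogger.info(f"{label} took {time.time() - start:.3f} seconds")

# usage, mirroring the two measured phases:
# with log_elapsed("add_doc: load/split"):
#     docs = kb_file.file2text()
```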
diff --git a/server/knowledge_base/kb_service/es_kb_service.py b/server/knowledge_base/kb_service/es_kb_service.py
index 2ef1872..d4a5109 100644
--- a/server/knowledge_base/kb_service/es_kb_service.py
+++ b/server/knowledge_base/kb_service/es_kb_service.py
@@ -9,7 +9,7 @@ from server.knowledge_base.kb_service.base import KBService, SupportedVSType
 from server.knowledge_base.utils import KnowledgeFile
 from server.utils import load_local_embeddings
 from elasticsearch import Elasticsearch,BadRequestError
-from configs import logger
+from configs import logger,appLogger
 from configs import kbs_config
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
@@ -30,13 +30,13 @@ class ESKBService(KBService):
                 self.es_client_python = Elasticsearch(f"http://{self.IP}:{self.PORT}",
                                                       basic_auth=(self.user,self.password))
             else:
-                logger.warning("ES username and password are not configured")
+                appLogger.warning("ES username and password are not configured")
                 self.es_client_python = Elasticsearch(f"http://{self.IP}:{self.PORT}")
         except ConnectionError:
-            logger.error("Failed to connect to Elasticsearch!")
+            appLogger.error("Failed to connect to Elasticsearch!")
             raise ConnectionError
         except Exception as e:
-            logger.error(f"Error occurred : {e}")
+            appLogger.error(f"Error occurred : {e}")
             raise e
         try:
             # First try to create the index via es_client_python
             }
             self.es_client_python.indices.create(index=self.index_name, mappings=mappings)
         except BadRequestError as e:
-            logger.error("Failed to create the index, retrying")
-            logger.error(e)
+            appLogger.error("Failed to create the index, retrying")
+            appLogger.error(e)

         try:
             # langchain ES connection / index creation
@@ -67,7 +67,7 @@ class ESKBService(KBService):
                     es_password=self.password
                 )
             else:
-                logger.warning("ES username and password are not configured")
+                appLogger.warning("ES username and password are not configured")
                 self.db_init = ElasticsearchStore(
                     es_url=f"http://{self.IP}:{self.PORT}",
                     index_name=self.index_name,
@@ -77,10 +77,10 @@
                 )
         except ConnectionError:
             print("### Failed to initialize Elasticsearch!")
-            logger.error("### Failed to initialize Elasticsearch!")
+            appLogger.error("### Failed to initialize Elasticsearch!")
             raise ConnectionError
         except Exception as e:
-            logger.error(f"Error occurred : {e}")
+            appLogger.error(f"Error occurred : {e}")
             raise e
         try:
             # Try to create the index via db_init
@@ -89,8 +89,8 @@
                 dims_length=self.dims_length
             )
         except Exception as e:
-            logger.error("Failed to create the index...")
-            logger.error(e)
+            appLogger.error("Failed to create the index...")
+            appLogger.error(e)
             # raise e
@@ -156,9 +156,9 @@
         except ConnectionError as ce:
             print(ce)
             print("Failed to connect to Elasticsearch!")
-            logger.error("Failed to connect to Elasticsearch!")
+            appLogger.error("Failed to connect to Elasticsearch!")
         except Exception as e:
-            logger.error(f"Error occurred : {e}")
+            appLogger.error(f"Error occurred : {e}")
             print(e)
@@ -172,7 +172,7 @@ class ESKBService(KBService):
     def searchbyContent(self, query:str, top_k: int = 2):
         if self.es_client_python.indices.exists(index=self.index_name):
-            print(f"******ESKBService searchByContent {self.index_name},query:{query}")
+            appLogger.info(f"******ESKBService searchByContent {self.index_name},query:{query}")
             tem_query = {
                 "query": {"match": {
                     "context": "*" + query + "*"
@@ -199,7 +199,7 @@ class ESKBService(KBService):
     def searchbyContentInternal(self, query:str, top_k: int = 2):
         if self.es_client_python.indices.exists(index=self.index_name):
-            print(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
+            appLogger.info(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
             tem_query = {
                 "query": {"match": {
                     "context": "*" + query + "*"
@@ -231,19 +231,19 @@ class ESKBService(KBService):
                         metadata=result["_source"]["metadata"],
                     ))
         except Exception as e:
-            logger.error(f"ES Docs Get Error! {e}")
+            appLogger.error(f"ES Docs Get Error! {e}")
         return result_list

     def del_doc_by_ids(self,ids: List[str]) -> bool:
-        print(f"es_kb_service del_doc_by_ids")
+        appLogger.info(f"es_kb_service del_doc_by_ids")
         for doc_id in ids:
             try:
                 self.es_client_python.delete(index=self.index_name,
                                              id=doc_id,
                                              refresh=True)
             except Exception as e:
-                logger.error(f"ES Docs Delete Error! {e}")
+                appLogger.error(f"ES Docs Delete Error! {e}")

     def do_delete_doc(self, kb_file, **kwargs):
@@ -262,7 +262,7 @@
             search_results = self.es_client_python.search(index=self.index_name, body=query,size=200)
             delete_list = [hit["_id"] for hit in search_results['hits']['hits']]
             size = len(delete_list)
-            print(f"***do_delete_doc: deleted size:{size}, {delete_list}")
+            #print(f"***do_delete_doc: deleted size:{size}, {delete_list}")
             if len(delete_list) == 0:
                 return None
             else:
@@ -272,7 +272,7 @@
                 for doc_id in delete_list:
                     try:
                         self.es_client_python.delete(index=self.index_name,
                                                      id=doc_id,
                                                      refresh=True)
                     except Exception as e:
-                        logger.error(f"ES Docs Delete Error! {e}")
+                        appLogger.error(f"ES Docs Delete Error! {e}")
             # self.db_init.delete(ids=delete_list)
             #self.es_client_python.indices.refresh(index=self.index_name)
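One behavioral note on searchbyContent/searchbyContentInternal: a `match` query analyzes its input, so the "*" characters concatenated around the query string are treated as literal text, not wildcard operators; a true substring match needs a `wildcard` query. A minimal sketch of that variant follows, with placeholder host, index name, and search term (assumptions, not from this PR):

```python
# Sketch: substring search with an explicit wildcard query instead of the
# match query used above. Host and index name are placeholders.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
query = {"query": {"wildcard": {"context": {"value": "*retrieval*"}}}}
resp = es.search(index="my_kb_index", body=query, size=10)
for hit in resp["hits"]["hits"]:
    print(hit["_id"], hit["_source"].get("metadata"))
```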
{e}") # self.db_init.delete(ids=delete_list) #self.es_client_python.indices.refresh(index=self.index_name) @@ -300,8 +300,8 @@ class ESKBService(KBService): if len(search_results["hits"]["hits"]) == 0: raise ValueError("召回元素个数为0") info_docs = [{"id":hit["_id"], "metadata": hit["_source"]["metadata"]} for hit in search_results["hits"]["hits"]] - size = len(info_docs) - print(f"do_add_doc 召回元素个数:{size}") + #size = len(info_docs) + #print(f"do_add_doc 召回元素个数:{size}") return info_docs diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index 2c465f8..96b25bc 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -5,6 +5,7 @@ from configs import ( OVERLAP_SIZE, ZH_TITLE_ENHANCE, logger, + appLogger, log_verbose, text_splitter_dict, LLM_MODELS, @@ -94,7 +95,7 @@ def list_files_from_folder(kb_name: str): process_entry(entry) except Exception as e: - logger.error(f"Error 发生 : {e}") + appLogger.error(f"Error 发生 : {e}") return result @@ -175,7 +176,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None): DocumentLoader = getattr(document_loaders_module, loader_name) except Exception as e: msg = f"为文件{file_path}查找加载器{loader_name}时出错:{e}" - logger.error(f'{e.__class__.__name__}: {msg}', + appLogger.error(f'{e.__class__.__name__}: {msg}', exc_info=e if log_verbose else None) document_loaders_module = importlib.import_module('langchain.document_loaders') DocumentLoader = getattr(document_loaders_module, "UnstructuredFileLoader") @@ -314,7 +315,7 @@ class KnowledgeFile: def file2docs(self, refresh: bool = False): if self.docs is None or refresh: - logger.info(f"{self.document_loader_name} used for {self.filepath}") + appLogger.info(f"{self.document_loader_name} used for {self.filepath}") loader = get_loader(loader_name=self.document_loader_name, file_path=self.filepath, loader_kwargs=self.loader_kwargs) @@ -439,7 +440,7 @@ def files2docs_in_thread( return True, (file.kb_name, file.filename, file.file2text(**kwargs)) except Exception as e: msg = f"从文件 {file.kb_name}/{file.filename} 加载文档时出错:{e}" - logger.error(f'{e.__class__.__name__}: {msg}', + appLogger.error(f'{e.__class__.__name__}: {msg}', exc_info=e if log_verbose else None) return False, (file.kb_name, file.filename, msg) diff --git a/text_splitter/zh_second_title_enhance.py b/text_splitter/zh_second_title_enhance.py index 8ac459c..90b4b31 100644 --- a/text_splitter/zh_second_title_enhance.py +++ b/text_splitter/zh_second_title_enhance.py @@ -158,20 +158,20 @@ def zh_third_title_enhance(docs: Document) -> Document: #print(f"zh_third_title_enhance ....") if len(docs) > 0: for doc in docs: - print(f"zh_third_title_enhance: {doc}") + #print(f"zh_third_title_enhance: {doc}") third_title = get_third_level_title(doc.page_content) if third_title: title = third_title - print(f"title: {title}") + #print(f"title: {title}") elif title: - print(f"title is not none") + #print(f"title is not none") temp_fourth_content = is_fourth_level_content(doc.page_content) if temp_fourth_content: #print(f"is_fourth_level_content : {temp_fourth_content}") doc.page_content = f"{title} {doc.page_content}" else: title = None - print(f"final title: {title}") + #print(f"final title: {title}") return docs else: print("zh_third_title_enhance 文件不存在") @@ -181,16 +181,16 @@ def zh_second_title_enhance(docs: Document) -> Document: title = None if len(docs) > 0: for doc in docs: - print(f"zh_second_title_enhance: {doc}") + #print(f"zh_second_title_enhance: {doc}") second_title = 
diff --git a/text_splitter/zh_second_title_enhance.py b/text_splitter/zh_second_title_enhance.py
index 8ac459c..90b4b31 100644
--- a/text_splitter/zh_second_title_enhance.py
+++ b/text_splitter/zh_second_title_enhance.py
@@ -158,20 +158,20 @@ def zh_third_title_enhance(docs: Document) -> Document:
     #print(f"zh_third_title_enhance ....")
     if len(docs) > 0:
         for doc in docs:
-            print(f"zh_third_title_enhance: {doc}")
+            #print(f"zh_third_title_enhance: {doc}")
             third_title = get_third_level_title(doc.page_content)
             if third_title:
                 title = third_title
-                print(f"title: {title}")
+                #print(f"title: {title}")
             elif title:
-                print(f"title is not none")
+                #print(f"title is not none")
                 temp_fourth_content = is_fourth_level_content(doc.page_content)
                 if temp_fourth_content:
                     #print(f"is_fourth_level_content : {temp_fourth_content}")
                     doc.page_content = f"{title} {doc.page_content}"
                 else:
                     title = None
-            print(f"final title: {title}")
+            #print(f"final title: {title}")
         return docs
     else:
         print("zh_third_title_enhance file does not exist")
@@ -181,16 +181,16 @@ def zh_second_title_enhance(docs: Document) -> Document:
     title = None
     if len(docs) > 0:
         for doc in docs:
-            print(f"zh_second_title_enhance: {doc}")
+            #print(f"zh_second_title_enhance: {doc}")
             second_title = get_second_level_title(doc.page_content)
             if second_title:
                 title = second_title
-                print(f"title: {title}")
+                #print(f"title: {title}")
             elif title:
-                print(f"title is not none")
+                #print(f"title is not none")
                 temp_third_content = is_third_level_content(doc.page_content)
                 if temp_third_content:
-                    print(f"is_third_level_content : {temp_third_content}")
+                    #print(f"is_third_level_content : {temp_third_content}")
                     doc.page_content = f"{title} {doc.page_content}"
                 else:
                     title = None
@@ -204,19 +204,19 @@ def zh_first_title_enhance(docs: Document) -> Document:
     title = None
     if len(docs) > 0:
         for doc in docs:
-            print(f"zh_first_title_enhance: {doc}")
+            #print(f"zh_first_title_enhance: {doc}")
             first_title = get_fist_level_title(doc.page_content)
             if first_title:
                 title = first_title
-                print(f"title: {title}")
+                #print(f"title: {title}")
             elif title:
                 temp_second_content = is_second_level_content(doc.page_content)
                 if temp_second_content:
-                    print(f"is_second_level_content : {temp_second_content}")
+                    #print(f"is_second_level_content : {temp_second_content}")
                     doc.page_content = f"{title} {doc.page_content}"
                 else:
                     title = None
-            print(f"final title: {title}")
+            #print(f"final title: {title}")
         return docs
     else:
         print("zh_first_title_enhance file does not exist")
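All three zh_*_title_enhance functions share one pattern: remember the most recent heading at a given level and prefix it onto following chunks classified as that heading's sub-content. A distilled sketch (title_enhance, Chunk, and the two callback parameters are illustrative names, not part of the module):

```python
# Distilled sketch of the title-carrying pattern shared by the three
# zh_*_title_enhance functions above.
from dataclasses import dataclass
from typing import Callable, List, Optional

@dataclass
class Chunk:  # stand-in for langchain's Document
    page_content: str

def title_enhance(docs: List[Chunk],
                  get_title: Callable[[str], Optional[str]],
                  is_sub_content: Callable[[str], bool]) -> List[Chunk]:
    title = None
    for doc in docs:
        current = get_title(doc.page_content)
        if current:
            title = current                                   # a new section starts here
        elif title:
            if is_sub_content(doc.page_content):
                doc.page_content = f"{title} {doc.page_content}"
            else:
                title = None                                  # content left the section
    return docs
```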