diff --git a/requirements.txt b/requirements.txt index db65bc8..412420c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -78,3 +78,4 @@ streamlit-modal>=0.1.0 streamlit-aggrid>=0.3.4.post3 watchdog>=3.0.0 docx2txt +elasticsearch \ No newline at end of file diff --git a/server/chat/knowledge_base_chat.py b/server/chat/knowledge_base_chat.py index 60956b4..79ed159 100644 --- a/server/chat/knowledge_base_chat.py +++ b/server/chat/knowledge_base_chat.py @@ -78,11 +78,12 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入", max_tokens=max_tokens, callbacks=[callback], ) - docs = await run_in_threadpool(search_docs, - query=query, - knowledge_base_name=knowledge_base_name, - top_k=top_k, - score_threshold=score_threshold) + docs = search_docs(query, knowledge_base_name, 10, score_threshold) + # docs = await run_in_threadpool(search_docs, + # query=query, + # knowledge_base_name=knowledge_base_name, + # top_k=10, + # score_threshold=score_threshold) # 加入reranker if USE_RERANKER: @@ -99,6 +100,7 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入", query=query) print("---------after rerank------------------") print(docs) + context = "\n".join([doc.page_content for doc in docs]) if len(docs) == 0: # 如果没有找到相关文档,使用empty模板 diff --git a/server/knowledge_base/kb_service/es_kb_service.py b/server/knowledge_base/kb_service/es_kb_service.py index 14d6e43..77f0d02 100644 --- a/server/knowledge_base/kb_service/es_kb_service.py +++ b/server/knowledge_base/kb_service/es_kb_service.py @@ -36,12 +36,12 @@ class ESKBService(KBService): except Exception as e: logger.error(f"Error 发生 : {e}") raise e - try: - # 首先尝试通过es_client_python创建 - self.es_client_python.indices.create(index=self.index_name) - except BadRequestError as e: - logger.error("创建索引失败,重新") - logger.error(e) + # try: + # # 首先尝试通过es_client_python创建 + # self.es_client_python.indices.create(index=self.index_name) + # except BadRequestError as e: + # logger.error("创建索引失败,重新") + # logger.error(e) try: # langchain ES 连接、创建索引 @@ -156,15 +156,17 @@ class ESKBService(KBService): logger.error(f"ES Docs Delete Error! {e}") def do_delete_doc(self, kb_file, **kwargs): + base_file_name = os.path.basename(kb_file.filepath) if self.es_client_python.indices.exists(index=self.index_name): # 从向量数据库中删除索引(文档名称是Keyword) query = { "query": { "term": { - "metadata.source.keyword": kb_file.filepath + "metadata.source.keyword": base_file_name } } } + print(f"***do_delete_doc: kb_file.filepath:{kb_file.filepath}, base_file_name:{base_file_name}") # 注意设置size,默认返回10个。 search_results = self.es_client_python.search(body=query, size=50) delete_list = [hit["_id"] for hit in search_results['hits']['hits']] diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index f47a792..d198c68 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -66,6 +66,7 @@ def list_files_from_folder(kb_name: str): if is_skiped_path(entry.path): return + if entry.is_symlink(): target_path = os.path.realpath(entry.path) with os.scandir(target_path) as target_it: @@ -79,10 +80,16 @@ def list_files_from_folder(kb_name: str): for sub_entry in it: process_entry(sub_entry) - with os.scandir(doc_path) as it: - for entry in it: - process_entry(entry) + #added by weiweiwang 2024.1.3 for catch exception + try: + print(f"list_files_from_folder,doc_path:{doc_path}") + with os.scandir(doc_path) as it: + for entry in it: + process_entry(entry) + except Exception as e: + logger.error(f"Error 发生 : {e}") + return result #PDFPlumberLoader diff --git a/server/reranker/reranker.py b/server/reranker/reranker.py index c6cbebf..cac8d78 100644 --- a/server/reranker/reranker.py +++ b/server/reranker/reranker.py @@ -46,7 +46,7 @@ class LangchainReranker(BaseDocumentCompressor): # self.activation_fct=activation_fct # self.apply_softmax=apply_softmax - self._model = CrossEncoder(model_name=model_name_or_path, max_length=1024, device=device) + self._model = CrossEncoder(model_name=model_name_or_path, max_length=512, device=device) super().__init__( top_n=top_n, model_name_or_path=model_name_or_path, diff --git a/webui_pages/knowledge_base/knowledge_base.py b/webui_pages/knowledge_base/knowledge_base.py index 31fc151..fd61d7f 100644 --- a/webui_pages/knowledge_base/knowledge_base.py +++ b/webui_pages/knowledge_base/knowledge_base.py @@ -285,6 +285,7 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None): st.divider() + #added by weiweiw 2024.1.3 # cols = st.columns(3) # if cols[0].button( @@ -318,6 +319,7 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None): # with st.sidebar: # keyword = st.text_input("查询关键字") # top_k = st.slider("匹配条数", 1, 100, 3) + #ending added by weiweiw 2024.1.3 st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。") docs = []