diff --git a/requirements.txt b/requirements.txt index 1aef2e4..db65bc8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -77,3 +77,4 @@ streamlit-chatbox==1.1.11 streamlit-modal>=0.1.0 streamlit-aggrid>=0.3.4.post3 watchdog>=3.0.0 +docx2txt diff --git a/requirements_api.txt b/requirements_api.txt index c18c3a9..174e076 100644 --- a/requirements_api.txt +++ b/requirements_api.txt @@ -67,3 +67,4 @@ arxiv>=2.0.0 youtube-search>=2.1.2 duckduckgo-search>=3.9.9 metaphor-python>=0.1.23 +docx2txt \ No newline at end of file diff --git a/requirements_webui.txt b/requirements_webui.txt index a7d02f6..160d823 100644 --- a/requirements_webui.txt +++ b/requirements_webui.txt @@ -7,3 +7,4 @@ streamlit-modal>=0.1.0 streamlit-aggrid>=0.3.4.post3 httpx[brotli,http2,socks]>=0.25.2 watchdog>=3.0.0 +docx2txt diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py index 09a264f..6c3a1f8 100644 --- a/server/knowledge_base/kb_doc_api.py +++ b/server/knowledge_base/kb_doc_api.py @@ -35,7 +35,9 @@ def search_docs( data = [] if kb is not None: if query: + print(f"search_docs, query:{query}") docs = kb.search_docs(query, top_k, score_threshold) + print(f"search_docs, docs:{docs}") data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs] elif file_name or metadata: data = kb.list_docs(file_name=file_name, metadata=metadata) @@ -155,6 +157,8 @@ def upload_docs( failed_files = {} file_names = list(docs.keys()) + print(f"upload_docs, file_names:{file_names}") + # 先将上传的文件保存到磁盘 for result in _save_files_in_thread(files, knowledge_base_name=knowledge_base_name, override=override): filename = result["data"]["file_name"] @@ -164,7 +168,9 @@ def upload_docs( if filename not in file_names: file_names.append(filename) + # 对保存的文件进行向量化 + print(f"upload_docs, to_vector_store:{to_vector_store}") if to_vector_store: result = update_docs( knowledge_base_name=knowledge_base_name, diff --git a/server/knowledge_base/kb_service/es_kb_service.py b/server/knowledge_base/kb_service/es_kb_service.py index 4a408cb..14d6e43 100644 --- a/server/knowledge_base/kb_service/es_kb_service.py +++ b/server/knowledge_base/kb_service/es_kb_service.py @@ -141,6 +141,7 @@ class ESKBService(KBService): def do_search(self, query:str, top_k: int, score_threshold: float): # 文本相似性检索 + print(f"do_search,top_k:{top_k},score_threshold:{score_threshold}") docs = self.db_init.similarity_search_with_score(query=query, k=top_k) return docs diff --git a/server/knowledge_base/kb_service/faiss_kb_service.py b/server/knowledge_base/kb_service/faiss_kb_service.py index f073b4e..d4b94eb 100644 --- a/server/knowledge_base/kb_service/faiss_kb_service.py +++ b/server/knowledge_base/kb_service/faiss_kb_service.py @@ -62,6 +62,7 @@ class FaissKBService(KBService): top_k: int, score_threshold: float = SCORE_THRESHOLD, ) -> List[Document]: + print(f"do_search,top_k:{top_k},score_threshold:{score_threshold}") embed_func = EmbeddingsFunAdapter(self.embed_model) embeddings = embed_func.embed_query(query) with self.load_vector_store().acquire() as vs: