Fix the issue where uploading a file and embedding take too long
parent 5b47cbda32
commit 6ed7002758
server/knowledge_base/kb_doc_api.py

@@ -19,7 +19,7 @@ from server.knowledge_base.model.kb_document_model import DocumentWithVSId
 from typing import List, Dict
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from configs import USE_RANKING
+from configs import USE_RANKING, appLogger
 import jieba
 from typing import List, Dict,Tuple
 
@@ -41,12 +41,12 @@ def search_docs(
     if query:
         print(f"search_docs, query:{query}")
         docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
-        print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
+        #print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
 
         docs_key = kb.search_content_internal(query,2)
-        print(f"search_content_internal, len of docs {len(docs_key)}, docs:{docs_key}")
+        #print(f"search_content_internal, len of docs {len(docs_key)}, docs:{docs_key}")
         docs = merge_and_deduplicate(docs, docs_key)
-        print(f"after merge_and_deduplicate, len of docs: {len(docs)}, docs:{docs}")
+        #print(f"after merge_and_deduplicate, len of docs: {len(docs)}, docs:{docs}")
         if USE_RANKING:
             queryList = []
             queryList.append(query)
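The block above does hybrid retrieval: vector hits from kb.search_docs are merged with keyword hits from kb.search_content_internal before the optional re-ranking. merge_and_deduplicate itself is not part of this diff; a minimal sketch, assuming each hit is a (Document, score) pair carrying a metadata "id", might look like:

# Hypothetical sketch of merge_and_deduplicate (the real helper is not shown
# in this diff); assumes hits are (Document, score) pairs with a metadata "id".
def merge_and_deduplicate(vector_hits, keyword_hits):
    merged = {}
    for doc, score in list(vector_hits) + list(keyword_hits):
        key = doc.metadata.get("id") or doc.page_content
        # keep the highest-scoring copy of each document
        if key not in merged or score > merged[key][1]:
            merged[key] = (doc, score)
    return list(merged.values())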
@@ -57,16 +57,16 @@ def search_docs(
 
             vectorizer = TfidfVectorizer()
             tfidf_matrix = vectorizer.fit_transform(doc_contents)
-            print(f"****** search_docs, tfidf_matrix:{tfidf_matrix}")
+            #print(f"****** search_docs, tfidf_matrix:{tfidf_matrix}")
             query_vector = vectorizer.transform(queryList)
-            print(f"****** search_docs, query_vector:{query_vector}")
+            #print(f"****** search_docs, query_vector:{query_vector}")
             cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
-            print(f"****** search_docs, cosine_similarities:{cosine_similarities}")
+            #print(f"****** search_docs, cosine_similarities:{cosine_similarities}")
 
             # Combine the similarity scores with the documents
             docs_with_scores = [(doc, score) for doc, score in zip(docs, cosine_similarities)]
             sorted_docs = sorted(docs_with_scores, key=lambda x: x[1], reverse=True)
-            print(f"****** search_docs, sorted_docs:{sorted_docs}")
+            #print(f"****** search_docs, sorted_docs:{sorted_docs}")
             i = 0
             for doc in sorted_docs:
                 if i>=top_k:
@@ -74,7 +74,7 @@ def search_docs(
                 else:
                     data.append(DocumentWithVSId(page_content = doc[0][0].page_content,id=doc[0][0].metadata.get("id"), score=doc[0][1],metadata=doc[0][0].metadata))
                     i = i+1
-            print(f"****** search_docs top K , sorted_docs:{data}")
+            #print(f"****** search_docs top K , sorted_docs:{data}")
     else:
         data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs]
 
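The three hunks above only silence debug prints around the optional TF-IDF re-ranking step, but the ranking itself is worth seeing in isolation. A self-contained sketch of the same scikit-learn flow, with made-up documents, query, and top_k:

# Self-contained sketch of the TF-IDF re-ranking used in search_docs above;
# the documents, query, and top_k value are made-up stand-ins.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

doc_contents = ["uploading files to the knowledge base",
                "embedding and vector store maintenance",
                "chunking documents before embedding"]
query = "upload embedding"
top_k = 2

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(doc_contents)   # one row per candidate doc
query_vector = vectorizer.transform([query])            # same vocabulary as the docs
scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

# pair each doc with its score and keep the best top_k, as the diff does
ranked = sorted(zip(doc_contents, scores), key=lambda x: x[1], reverse=True)[:top_k]
print(ranked)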
@@ -355,7 +355,7 @@ def update_docs(
     failed_files = {}
     kb_files = []
 
-    print(f"111111 kb_doc_api update_docs file_name:{file_names}, number of updated docs:{len(docs)}")
+    appLogger.info(f"111111 kb_doc_api update_docs file_names:{file_names}, number of updated docs:{len(docs)}")
     # Build the list of files whose docs need to be loaded
     for file_name in file_names:
         file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name)
@@ -363,40 +363,35 @@ def update_docs(
         if file_detail.get("custom_docs") and not override_custom_docs:
             continue
         if file_name not in docs:
-            print(f"****kb_doc_api update_docs file_name not in docs")
             try:
+                appLogger.info(f"****kb_doc_api update_docs file_name not in docs,filename:{file_name}")
                 kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name))
 
-                # Generate docs from the files and vectorize them.
-                # This relies on KnowledgeFile's cache: Documents are loaded in multiple threads, then handed to KnowledgeFile
-                for status, result in files2docs_in_thread(kb_files,
-                                                           chunk_size=chunk_size,
-                                                           chunk_overlap=chunk_overlap,
-                                                           zh_title_enhance=zh_title_enhance):
-                    if status:
-                        print(f"kb_doc_api update_docs generated docs from the file and vectorized them, filename:{file_name}")
-                        kb_name, file_name, new_docs = result
-                        kb_file = KnowledgeFile(filename=file_name,
-                                                knowledge_base_name=knowledge_base_name)
-                        kb_file.splited_docs = new_docs
-                        kb.update_doc(kb_file, not_refresh_vs_cache=True)
-                    else:
-                        kb_name, file_name, error = result
-                        failed_files[file_name] = error
 
             except Exception as e:
                 msg = f"Error while loading document {file_name}: {e}"
                 logger.error(f'{e.__class__.__name__}: {msg}',
                              exc_info=e if log_verbose else None)
                 failed_files[file_name] = msg
 
+    # Generate docs from the files and vectorize them.
+    # This relies on KnowledgeFile's cache: Documents are loaded in multiple threads, then handed to KnowledgeFile
+    for status, result in files2docs_in_thread(kb_files,
+                                               chunk_size=chunk_size,
+                                               chunk_overlap=chunk_overlap,
+                                               zh_title_enhance=zh_title_enhance):
+        if status:
+            kb_name, file_name, new_docs = result
+            kb_file = KnowledgeFile(filename=file_name,
+                                    knowledge_base_name=knowledge_base_name)
+            kb_file.splited_docs = new_docs
+            kb.update_doc(kb_file, not_refresh_vs_cache=True)
         else:
-            print(f"****kb_doc_api update_docs file_name in docs")
+            kb_name, file_name, error = result
+            failed_files[file_name] = error
 
     # Vectorize the custom docs
     for file_name, v in docs.items():
-        print(f"222222 kb_doc_api update_docs file_name:{file_name}, number of updated docs:{len(docs)}")
         try:
-            print(f"kb_doc_api update_docs vectorizing custom docs, filename:{file_name}")
+            appLogger.info(f"222222 kb_doc_api update_docs file_name:{file_name}, number of updated docs:{len(docs)}")
             v = [x if isinstance(x, Document) else Document(**x) for x in v]
             kb_file = KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name)
             kb.update_doc(kb_file, docs=v, not_refresh_vs_cache=True)
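This hunk is the heart of the speed fix named in the commit message: previously files2docs_in_thread ran inside the per-file loop, so after appending the n-th file the whole accumulated kb_files list was chunked and embedded again, making the total work quadratic in the number of uploaded files. Hoisting the loop out embeds each file once. A toy sketch of the difference (files and load_and_embed are hypothetical stand-ins, not the project's API):

# Toy illustration of the restructuring; names here are hypothetical.
files = ["a.txt", "b.txt", "c.txt"]

def load_and_embed(batch):
    # pretend this chunks and embeds every file in `batch`
    return [f"embedded:{name}" for name in batch]

# Before: embedding ran after every append, over everything collected so far,
# i.e. 1 + 2 + ... + n file embeddings (quadratic).
kb_files = []
for name in files:
    kb_files.append(name)
    load_and_embed(kb_files)

# After: collect first, embed once, i.e. n file embeddings (linear).
kb_files = []
for name in files:
    kb_files.append(name)
load_and_embed(kb_files)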
@@ -409,10 +404,8 @@ def update_docs(
     if not not_refresh_vs_cache:
         kb.save_vector_store()
 
-    print(f"kb_doc_api update_docs before finishing, failed_files:{failed_files}")
     return BaseResponse(code=200, msg=f"Finished updating documents", data={"failed_files": failed_files})
 
 
 def download_doc(
         knowledge_base_name: str = Query(..., description="Knowledge base name", examples=["samples"]),
         file_name: str = Query(..., description="File name", examples=["test.txt"]),
server/knowledge_base/kb_service/base.py
@@ -28,6 +28,8 @@ from typing import List, Union, Dict, Optional, Tuple
 from server.embeddings_api import embed_texts, aembed_texts, embed_documents
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
+from configs import logger,appLogger
+import time
 
 
 def normalize(embeddings: List[List[float]]) -> np.ndarray:
@@ -108,16 +110,22 @@ class KBService(ABC):
         Add a file to the knowledge base.
         If docs is given, the text is not vectorized again, and the corresponding DB entries are marked custom_docs=True
         """
+        start_time = time.time()  # record the start time
         if docs:
             custom_docs = True
             for doc in docs:
                 doc.metadata.setdefault("source", kb_file.filename)
-            print(f"kb_doc_api add_doc docs not empty, len(docs):{len(docs)}")
+            appLogger.info(f"kb_doc_api add_doc docs not empty, len(docs):{len(docs)}, file name:{kb_file.filename}")
         else:
             docs = kb_file.file2text()
             custom_docs = False
-            print(f"kb_doc_api add_doc docs empty, len(docs):{len(docs)}")
+            appLogger.info(f"kb_doc_api add_doc docs empty, len(docs):{len(docs)}, file name:{kb_file.filename}")
 
+        end_time = time.time()  # record the end time
+        execution_time = end_time - start_time  # compute the elapsed time
+        appLogger.info(f"add_doc: loading the file or splitting it into chunks took {execution_time} seconds")
 
+        start_time = time.time()  # record the start time
         if docs:
             # change metadata["source"] to a relative path
             for doc in docs:
@@ -130,15 +138,19 @@ class KBService(ABC):
                     rel_path = Path(source).relative_to(self.doc_path)
                     doc.metadata["source"] = str(rel_path.as_posix().strip("/"))
                 except Exception as e:
-                    print(f"cannot convert absolute path ({source}) to relative path. error is : {e}")
+                    appLogger.info(f"cannot convert absolute path ({source}) to relative path. error is : {e}")
             self.delete_doc(kb_file)
-            print(f"add_doc filepath:{kb_file.filepath}, about to run do_add_doc")
+            #appLogger.info(f"add_doc filepath:{kb_file.filepath}, about to run do_add_doc")
             doc_infos = self.do_add_doc(docs, **kwargs)
-            print(f"add_doc filepath:{kb_file.filepath} about to run dd_file_to_db")
+            #appLogger.info(f"add_doc filepath:{kb_file.filepath} about to run dd_file_to_db")
             status = add_file_to_db(kb_file,
                                     custom_docs=custom_docs,
                                     docs_count=len(docs),
                                     doc_infos=doc_infos)
 
+            end_time = time.time()  # record the end time
+            execution_time = end_time - start_time  # compute the elapsed time
+            appLogger.info(f"add_doc: writing to the store took {execution_time} seconds")
         else:
             status = False
         return status
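The two start_time/end_time blocks added above time the load/split phase and the store-write phase separately, which lets a slow upload be attributed to chunking versus embedding and DB writes. A sketch of the same pattern as a reusable context manager; the timed wrapper is hypothetical, not part of this commit:

# Hypothetical helper wrapping the time.time() pattern used in add_doc above.
import time
from contextlib import contextmanager

@contextmanager
def timed(label, log):
    start = time.time()
    try:
        yield
    finally:
        log.info(f"{label} took {time.time() - start:.3f} seconds")

# Usage, mirroring the two phases instrumented in add_doc:
# with timed("add_doc: load/split", appLogger):
#     docs = kb_file.file2text()
# with timed("add_doc: write to store", appLogger):
#     doc_infos = self.do_add_doc(docs, **kwargs)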
@@ -214,7 +226,7 @@ class KBService(ABC):
     def del_doc_by_ids_from_db(self, knowledge_base_name: str , file_name:str, ids: List[str]) -> bool:
         delete_docs_from_db_by_ids(ids)
         update_file_to_db(knowledge_base_name = knowledge_base_name,file_name = file_name)
-        print(f"*******KBService del_doc_by_ids_from_db")
+        #print(f"*******KBService del_doc_by_ids_from_db")
         return True
 
 
@@ -239,7 +251,7 @@ class KBService(ABC):
         Retrieve Documents by file_name or metadata
         '''
         doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata)
-        print(f"kb_doc_api list_docs_from_db: {doc_infos}")
+        #appLogger.info(f"kb_doc_api list_docs_from_db: {doc_infos}")
         docs = []
         for x in doc_infos:
             doc_info = self.get_doc_by_ids([x["id"]])
@@ -257,7 +269,7 @@ class KBService(ABC):
             else:
                 # Handle the case where doc_info is None or not a list
                 # We may skip this loop iteration or take some other action
-                print("base.py list_docs returned empty")
+                #print("base.py list_docs returned empty")
                 pass
         return docs
 
server/knowledge_base/kb_service/es_kb_service.py
@@ -9,7 +9,7 @@ from server.knowledge_base.kb_service.base import KBService, SupportedVSType
 from server.knowledge_base.utils import KnowledgeFile
 from server.utils import load_local_embeddings
 from elasticsearch import Elasticsearch,BadRequestError
-from configs import logger
+from configs import logger,appLogger
 from configs import kbs_config
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
 
@@ -30,13 +30,13 @@ class ESKBService(KBService):
                 self.es_client_python = Elasticsearch(f"http://{self.IP}:{self.PORT}",
                                                       basic_auth=(self.user,self.password))
             else:
-                logger.warning("ES username and password are not configured")
+                appLogger.warning("ES username and password are not configured")
                 self.es_client_python = Elasticsearch(f"http://{self.IP}:{self.PORT}")
         except ConnectionError:
-            logger.error("Failed to connect to Elasticsearch!")
+            appLogger.error("Failed to connect to Elasticsearch!")
             raise ConnectionError
         except Exception as e:
-            logger.error(f"Error occurred: {e}")
+            appLogger.error(f"Error occurred: {e}")
             raise e
         try:
             # First try to create the index via es_client_python
@@ -51,8 +51,8 @@ class ESKBService(KBService):
                 }
                 self.es_client_python.indices.create(index=self.index_name, mappings=mappings)
             except BadRequestError as e:
-                logger.error("Failed to create the index, retrying")
-                logger.error(e)
+                appLogger.error("Failed to create the index, retrying")
+                appLogger.error(e)
 
         try:
             # langchain ES connection and index creation
@@ -67,7 +67,7 @@ class ESKBService(KBService):
                     es_password=self.password
                 )
             else:
-                logger.warning("ES username and password are not configured")
+                appLogger.warning("ES username and password are not configured")
                 self.db_init = ElasticsearchStore(
                     es_url=f"http://{self.IP}:{self.PORT}",
                     index_name=self.index_name,
@@ -77,10 +77,10 @@ class ESKBService(KBService):
                 )
         except ConnectionError:
             print("### Failed to initialize Elasticsearch!")
-            logger.error("### Failed to initialize Elasticsearch!")
+            appLogger.error("### Failed to initialize Elasticsearch!")
             raise ConnectionError
         except Exception as e:
-            logger.error(f"Error occurred: {e}")
+            appLogger.error(f"Error occurred: {e}")
             raise e
         try:
             # Try to create the index via db_init
@@ -89,8 +89,8 @@ class ESKBService(KBService):
                 dims_length=self.dims_length
             )
         except Exception as e:
-            logger.error("Failed to create the index...")
-            logger.error(e)
+            appLogger.error("Failed to create the index...")
+            appLogger.error(e)
             # raise e
 
 
@@ -156,9 +156,9 @@ class ESKBService(KBService):
         except ConnectionError as ce:
             print(ce)
             print("Failed to connect to Elasticsearch!")
-            logger.error("Failed to connect to Elasticsearch!")
+            appLogger.error("Failed to connect to Elasticsearch!")
         except Exception as e:
-            logger.error(f"Error occurred: {e}")
+            appLogger.error(f"Error occurred: {e}")
             print(e)
 
 
@@ -172,7 +172,7 @@ class ESKBService(KBService):
 
     def searchbyContent(self, query:str, top_k: int = 2):
         if self.es_client_python.indices.exists(index=self.index_name):
-            print(f"******ESKBService searchByContent {self.index_name},query:{query}")
+            appLogger.info(f"******ESKBService searchByContent {self.index_name},query:{query}")
             tem_query = {
                 "query": {"match": {
                     "context": "*" + query + "*"
@@ -199,7 +199,7 @@ class ESKBService(KBService):
 
     def searchbyContentInternal(self, query:str, top_k: int = 2):
         if self.es_client_python.indices.exists(index=self.index_name):
-            print(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
+            appLogger.info(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
             tem_query = {
                 "query": {"match": {
                     "context": "*" + query + "*"
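A side note on the unchanged query in these two hunks: in an Elasticsearch match query the * characters are not wildcards, because match analyzes its input, so "*" + query + "*" behaves like a plain full-text match on the query terms. If literal wildcard matching were intended, a wildcard query is the usual tool; a hedged sketch, where the client and index name are stand-ins rather than the project's configuration:

# Hypothetical sketch of a wildcard query; "my_index" and the local client
# are stand-ins, not the project's configuration.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
body = {"query": {"wildcard": {"context": {"value": "*upload*"}}}}
hits = es.search(index="my_index", body=body, size=2)
print(hits["hits"]["hits"])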
@@ -231,19 +231,19 @@ class ESKBService(KBService):
                                 metadata=result["_source"]["metadata"],
                                 ))
         except Exception as e:
-            logger.error(f"ES Docs Get Error! {e}")
+            appLogger.error(f"ES Docs Get Error! {e}")
         return result_list
 
 
     def del_doc_by_ids(self,ids: List[str]) -> bool:
-        print(f"es_kb_service del_doc_by_ids")
+        appLogger.info(f"es_kb_service del_doc_by_ids")
         for doc_id in ids:
             try:
                 self.es_client_python.delete(index=self.index_name,
                                              id=doc_id,
                                              refresh=True)
             except Exception as e:
-                logger.error(f"ES Docs Delete Error! {e}")
+                appLogger.error(f"ES Docs Delete Error! {e}")
 
 
     def do_delete_doc(self, kb_file, **kwargs):
@@ -262,7 +262,7 @@ class ESKBService(KBService):
         search_results = self.es_client_python.search(index=self.index_name, body=query,size=200)
         delete_list = [hit["_id"] for hit in search_results['hits']['hits']]
         size = len(delete_list)
-        print(f"***do_delete_doc: size to delete:{size}, {delete_list}")
+        #print(f"***do_delete_doc: size to delete:{size}, {delete_list}")
         if len(delete_list) == 0:
             return None
         else:
@@ -272,7 +272,7 @@ class ESKBService(KBService):
                                              id=doc_id,
                                              refresh=True)
             except Exception as e:
-                logger.error(f"ES Docs Delete Error! {e}")
+                appLogger.error(f"ES Docs Delete Error! {e}")
 
             # self.db_init.delete(ids=delete_list)
             #self.es_client_python.indices.refresh(index=self.index_name)
@@ -300,8 +300,8 @@ class ESKBService(KBService):
         if len(search_results["hits"]["hits"]) == 0:
             raise ValueError("Number of recalled elements is 0")
         info_docs = [{"id":hit["_id"], "metadata": hit["_source"]["metadata"]} for hit in search_results["hits"]["hits"]]
-        size = len(info_docs)
-        print(f"do_add_doc number of recalled elements:{size}")
+        #size = len(info_docs)
+        #print(f"do_add_doc number of recalled elements:{size}")
         return info_docs
 
 
server/knowledge_base/utils.py
@ -5,6 +5,7 @@ from configs import (
|
||||||
OVERLAP_SIZE,
|
OVERLAP_SIZE,
|
||||||
ZH_TITLE_ENHANCE,
|
ZH_TITLE_ENHANCE,
|
||||||
logger,
|
logger,
|
||||||
|
appLogger,
|
||||||
log_verbose,
|
log_verbose,
|
||||||
text_splitter_dict,
|
text_splitter_dict,
|
||||||
LLM_MODELS,
|
LLM_MODELS,
|
||||||
|
|
@@ -94,7 +95,7 @@ def list_files_from_folder(kb_name: str):
                 process_entry(entry)
 
     except Exception as e:
-        logger.error(f"Error occurred: {e}")
+        appLogger.error(f"Error occurred: {e}")
 
     return result
 
@@ -175,7 +176,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None):
         DocumentLoader = getattr(document_loaders_module, loader_name)
     except Exception as e:
         msg = f"Error finding loader {loader_name} for file {file_path}: {e}"
-        logger.error(f'{e.__class__.__name__}: {msg}',
+        appLogger.error(f'{e.__class__.__name__}: {msg}',
                      exc_info=e if log_verbose else None)
         document_loaders_module = importlib.import_module('langchain.document_loaders')
         DocumentLoader = getattr(document_loaders_module, "UnstructuredFileLoader")
@@ -314,7 +315,7 @@ class KnowledgeFile:
 
     def file2docs(self, refresh: bool = False):
         if self.docs is None or refresh:
-            logger.info(f"{self.document_loader_name} used for {self.filepath}")
+            appLogger.info(f"{self.document_loader_name} used for {self.filepath}")
             loader = get_loader(loader_name=self.document_loader_name,
                                 file_path=self.filepath,
                                 loader_kwargs=self.loader_kwargs)
@@ -439,7 +440,7 @@ def files2docs_in_thread(
             return True, (file.kb_name, file.filename, file.file2text(**kwargs))
         except Exception as e:
             msg = f"Error loading documents from file {file.kb_name}/{file.filename}: {e}"
-            logger.error(f'{e.__class__.__name__}: {msg}',
+            appLogger.error(f'{e.__class__.__name__}: {msg}',
                          exc_info=e if log_verbose else None)
             return False, (file.kb_name, file.filename, msg)
 
text_splitter/zh_title_enhance.py
@@ -158,20 +158,20 @@ def zh_third_title_enhance(docs: Document) -> Document:
     #print(f"zh_third_title_enhance ....")
     if len(docs) > 0:
         for doc in docs:
-            print(f"zh_third_title_enhance: {doc}")
+            #print(f"zh_third_title_enhance: {doc}")
             third_title = get_third_level_title(doc.page_content)
             if third_title:
                 title = third_title
-                print(f"title: {title}")
+                #print(f"title: {title}")
             elif title:
-                print(f"title is not none")
+                #print(f"title is not none")
                 temp_fourth_content = is_fourth_level_content(doc.page_content)
                 if temp_fourth_content:
                     #print(f"is_fourth_level_content : {temp_fourth_content}")
                     doc.page_content = f"{title} {doc.page_content}"
                 else:
                     title = None
-            print(f"final title: {title}")
+            #print(f"final title: {title}")
         return docs
     else:
         print("zh_third_title_enhance: file does not exist")
@@ -181,16 +181,16 @@ def zh_second_title_enhance(docs: Document) -> Document:
     title = None
     if len(docs) > 0:
         for doc in docs:
-            print(f"zh_second_title_enhance: {doc}")
+            #print(f"zh_second_title_enhance: {doc}")
             second_title = get_second_level_title(doc.page_content)
             if second_title:
                 title = second_title
-                print(f"title: {title}")
+                #print(f"title: {title}")
             elif title:
-                print(f"title is not none")
+                #print(f"title is not none")
                 temp_third_content = is_third_level_content(doc.page_content)
                 if temp_third_content:
-                    print(f"is_third_level_content : {temp_third_content}")
+                    #print(f"is_third_level_content : {temp_third_content}")
                     doc.page_content = f"{title} {doc.page_content}"
                 else:
                     title = None
@@ -204,19 +204,19 @@ def zh_first_title_enhance(docs: Document) -> Document:
     title = None
     if len(docs) > 0:
         for doc in docs:
-            print(f"zh_first_title_enhance: {doc}")
+            #print(f"zh_first_title_enhance: {doc}")
             first_title = get_fist_level_title(doc.page_content)
             if first_title:
                 title = first_title
-                print(f"title: {title}")
+                #print(f"title: {title}")
             elif title:
                 temp_second_content = is_second_level_content(doc.page_content)
                 if temp_second_content:
-                    print(f"is_second_level_content : {temp_second_content}")
+                    #print(f"is_second_level_content : {temp_second_content}")
                     doc.page_content = f"{title} {doc.page_content}"
                 else:
                     title = None
-            print(f"final title: {title}")
+            #print(f"final title: {title}")
         return docs
     else:
         print("zh_first_title_enhance: file does not exist")