diff --git a/libs/chatchat-server/chatchat/server/callback_handler/agent_callback_handler.py b/libs/chatchat-server/chatchat/server/callback_handler/agent_callback_handler.py index 157f968..a543c54 100644 --- a/libs/chatchat-server/chatchat/server/callback_handler/agent_callback_handler.py +++ b/libs/chatchat-server/chatchat/server/callback_handler/agent_callback_handler.py @@ -8,6 +8,9 @@ from uuid import UUID from langchain.callbacks import AsyncIteratorCallbackHandler from langchain.schema import AgentAction, AgentFinish from langchain_core.outputs import LLMResult +from chatchat.utils import build_logger + +logger = build_logger() def dumps(obj: Dict) -> str: @@ -31,6 +34,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): self.queue = asyncio.Queue() self.done = asyncio.Event() self.out = True + logger.info(f"init....") async def on_llm_start( self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any @@ -41,6 +45,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): } self.done.clear() self.queue.put_nowait(dumps(data)) + logger.info(f"prompts:{prompts}") async def on_llm_new_token(self, token: str, **kwargs: Any) -> None: special_tokens = ["\nAction:", "\nObservation:", "<|observation|>"] @@ -79,6 +84,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): } self.done.clear() self.queue.put_nowait(dumps(data)) + logger.info(f"messages:{messages}") async def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None: data = { @@ -86,6 +92,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): "text": response.generations[0][0].message.content, } self.queue.put_nowait(dumps(data)) + logger.info(f"response:{response.json}") async def on_llm_error( self, error: Exception | KeyboardInterrupt, **kwargs: Any @@ -114,6 +121,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): "tool_input": input_str, } self.queue.put_nowait(dumps(data)) + logger.info(f"input_str:{input_str}") async def on_tool_end( self, @@ -132,6 +140,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): } # self.done.clear() self.queue.put_nowait(dumps(data)) + logger.info(f"output:{output}") async def on_tool_error( self, @@ -151,6 +160,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): } # self.done.clear() self.queue.put_nowait(dumps(data)) + logger.error(f"error:{error.__class__}") async def on_agent_action( self, @@ -168,6 +178,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): "text": action.log, } self.queue.put_nowait(dumps(data)) + logger.error(f"tool_name:{action.tool},tool_input:{ action.tool_input}") async def on_agent_finish( self, @@ -188,6 +199,7 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): "text": finish.return_values["output"], } self.queue.put_nowait(dumps(data)) + logger.error(f"data:{data}") async def on_chain_end( self, @@ -200,3 +212,4 @@ class AgentExecutorAsyncIteratorCallbackHandler(AsyncIteratorCallbackHandler): ) -> None: self.done.set() self.out = True + logger.info(f"outputs:{outputs}") diff --git a/libs/chatchat-server/chatchat/server/file_rag/document_loaders/mypdfloader.py b/libs/chatchat-server/chatchat/server/file_rag/document_loaders/mypdfloader.py index bbdd9c7..71ec989 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/document_loaders/mypdfloader.py +++ b/libs/chatchat-server/chatchat/server/file_rag/document_loaders/mypdfloader.py @@ -5,10 +5,12 @@ import numpy as np import tqdm from langchain_community.document_loaders.unstructured import UnstructuredFileLoader from PIL import Image - +import re from chatchat.settings import Settings from chatchat.server.file_rag.document_loaders.ocr import get_ocr +from chatchat.utils import build_logger +logger = build_logger() class RapidOCRPDFLoader(UnstructuredFileLoader): def _get_elements(self) -> List: @@ -53,9 +55,11 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): ) b_unit.refresh() text = page.get_text("") - resp += text + "\n" - + # resp += text + "\n" + text_lines = text.strip().split("\n") + logger.info(f"****page:{i+1}****,文字内容:{text_lines}") img_list = page.get_image_info(xrefs=True) + ocr_result = [] for img in img_list: if xref := img.get("xref"): bbox = img["bbox"] @@ -86,8 +90,20 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): ocr_result = [line[1] for line in result] resp += "\n".join(ocr_result) + if (len(ocr_result)>0): + resp += "\n".join(ocr_result) + else: + if text_lines: + # 假设页码在最后一行 + if text_lines[-1].isdigit(): + text = "\n".join(text_lines[:-1]) + print(f"******去除了页码") + resp += text + "\n" # 更新进度 b_unit.update(1) + resp = re.sub(r'((? List[str]: + # Now that we have the separator, split the text + if separator: + if keep_separator: + # The parentheses in the pattern keep the delimiters in the result. + _splits = re.split(f"({separator})", text) + splits = ["".join(i) for i in zip(_splits[0::2], _splits[1::2])] + if len(_splits) % 2 == 1: + splits += _splits[-1:] + # splits = [_splits[0]] + splits + else: + splits = re.split(separator, text) + else: + splits = list(text) + return [s for s in splits if s != ""] + +def customerLen(text:str)->int: + length = len(re.sub(r'[\s\n]+', '', text)) + return length + +class CustomerChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = True, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or [ + First_SEPARATOE, + Second_SEPARATOE, + Third_SEPARATOE, + Fourth_SEPARATOE + #"\n\n", + #"\n", + # "。|!|?", + # "\.\s|\!\s|\?\s", + # ";|;\s", + # ",|,\s" + ] + self._is_separator_regex = is_separator_regex + self.is_recursive = False + self._length_function = customerLen + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + #print(f"***********************************ChineseRecursiveTextSplitter***********************************") + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + if self.is_recursive == False: + #一级目录 + text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块 + text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的 + text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换 + text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章 + + #二级目录 + text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2 + text = re.sub(r'(\n+(? bool: + # 文本长度为0,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty or longer than 25.") + return "" + + splitlines = text.splitlines() + first_line = splitlines[0] + # 文本中有标点符号,就不是title + ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" + ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) + if ENDS_IN_PUNCT_RE.search(first_line) is not None: + return "" + FIRST_TITLE = r'((? str: + # 文本长度为0的话,肯定不是title + lenght = len(text) + if lenght == 0: + print("Not a title. Text is empty or longer than 25.") + return "" + + splitlines = text.splitlines() + first_line = splitlines[0] + # 文本中有标点符号,就不是title + # ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" + # ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) + # if ENDS_IN_PUNCT_RE.search(first_line) is not None: + # return "" + + #3 **** + #3.1 ***** + #3.1.1 ***** + #另一个分块 + #3.1.2 ***** 所以二级目录可能在第二行 和第一行 + Second_TITLE = r'((?1: + Second_line = splitlines[1] + if TITLE_PUNCT_RE.search(Second_line) is not None: + return Second_line + return "" + +#judge if it is 2nd level content +def is_second_level_content( + text: str, +) -> bool: + # 文本长度为0的话,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty.") + return False + + splitlines = text.splitlines() + first_line = splitlines[0] + + Second_TITLE = r'((? bool: + # 文本长度为0的话,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty.") + return False + + splitlines = text.splitlines() + first_line = splitlines[0] + + Third_TITLE = r'((? str: + # 文本长度为0的话,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty or longer than 25.") + return "" + + splitlines = text.splitlines() + first_line = splitlines[0] + # 文本中有标点符号,就不是title + # ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" + # ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) + # if ENDS_IN_PUNCT_RE.search(first_line) is not None: + # return "" + + #3 **** + #3.1 ***** + #3.1.1 ***** + #3.1.1.1 ***** + #另一个分块 + #3.1.1.2 ***** 所以三级级目录可能在第三行 和第二行及第一行 + Third_TITLE = r'((?1: + Second_line = splitlines[1] + if TITLE_PUNCT_RE.search(Second_line) is not None: + return Second_line + else: + if len(splitlines)>2: + Second_line = splitlines[2] + if TITLE_PUNCT_RE.search(Second_line) is not None: + return Second_line + + return "" + +#judge if it is 4th level content +def is_fourth_level_content( + text: str, +) -> bool: + # 文本长度为0的话,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty.") + return False + + splitlines = text.splitlines() + first_line = splitlines[0] + + Third_TITLE = r'((? Document: + title = None + #print(f"zh_third_title_enhance ....") + if len(docs) > 0: + for doc in docs: + #print(f"zh_third_title_enhance: {doc}") + third_title = get_third_level_title(doc.page_content) + if third_title: + title = third_title + #print(f"title: {title}") + elif title: + #print(f"title is not none") + temp_fourth_content = is_fourth_level_content(doc.page_content) + if temp_fourth_content: + #print(f"is_fourth_level_content : {temp_fourth_content}") + doc.page_content = f"{title} {doc.page_content}" + else: + title = None + #print(f"final title: {title}") + return docs + else: + print("zh_third_title_enhance 文件不存在") + +#给三级被分开的内容 增加二级标题 +def zh_second_title_enhance(docs: Document) -> Document: + title = None + if len(docs) > 0: + for doc in docs: + logger.debug(f"zh_second_title_enhance: {doc}") + second_title = get_second_level_title(doc.page_content) + if second_title: + title = second_title + logger.debug(f"title: {title}") + elif title: + #print(f"title is not none") + temp_third_content = is_third_level_content(doc.page_content) + if temp_third_content: + #print(f"is_third_level_content : {temp_third_content}") + doc.page_content = f"{title} {doc.page_content}" + else: + title = None + logger.debug(f"final title: {title}") + return docs + else: + print("zh_second_title_enhance 文件不存在") + +#给二级被分开的内容 增加一级标题 +def zh_first_title_enhance(docs: Document) -> Document: + title = None + if len(docs) > 0: + for doc in docs: + logger.debug(f"zh_first_title_enhance: {doc}") + first_title = get_fist_level_title(doc.page_content) + if first_title: + title = first_title + logger.debug(f"title: {title}") + elif title: + temp_second_content = is_second_level_content(doc.page_content) + if temp_second_content: + logger.debug(f"is_second_level_content : {temp_second_content}") + doc.page_content = f"{title} {doc.page_content}" + else: + title = None + logger.debug(f"final title: {title}") + return docs + else: + print("zh_first_title_enhance 文件不存在") + + +if __name__ == "__main__": + str = """1 总 则\n1.1 本导则是编制和审查城市电力网(以下简称城网)规划的指导性文件,其 适用范围为国家电网公司所属的各网省公司、城市供电公司。\n1.2 城网是城市行政区划内为城市供电的各级电压电网的总称。城网是电力系 统的主要负荷中心,作为城市的重要基础设施之一,与城市的社会经济发展密切 相关。各城市应根据《中华人民共和国城市规划法》和《中华人民共和国电力法》 的相关规定,编制城网规划,并纳入相应的城市总体规划和各地区详细规划中。\n1.3 城网规划是城市总体规划的重要组成部分,应与城市的各项发展规划相互 配合、同步实施,做到与城市规划相协调,落实规划中所确定的线路走廊和地下 通道、变电站和配电室站址等供电设施用地。\n1.4 城网规划的目的是通过科学的规划,建设网络坚强、结构合理、安全可靠、 运行灵活、节能环保、经济高效的城市电网,不断提高城网供电能力和电能质量, 以满足城市经济增长和社会发展的需要。 ' metadata={'source': '/home/bns001/Langchain-Chatchat_0.2.9/knowledge_base/test/content/资产全寿命周期管理体系实施指南.docx'}""" + title = get_fist_level_title(str) + print(title) diff --git a/libs/chatchat-server/chatchat/server/knowledge_base/kb_doc_api.py b/libs/chatchat-server/chatchat/server/knowledge_base/kb_doc_api.py index 8e728ef..1c18020 100644 --- a/libs/chatchat-server/chatchat/server/knowledge_base/kb_doc_api.py +++ b/libs/chatchat-server/chatchat/server/knowledge_base/kb_doc_api.py @@ -71,6 +71,7 @@ def search_docs( if kb is not None: if query: docs = kb.search_docs(query, top_k, score_threshold) + logger.info(f"search_docs, query:{query},top_k:{top_k},score_threshold:{score_threshold}") # data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs] data = [DocumentWithVSId(**{"id": x.metadata.get("id"), **x.dict()}) for x in docs] elif file_name or metadata: