From a72de9ecc4f48686f90ec65ac6162f9be68f9ef3 Mon Sep 17 00:00:00 2001 From: weiweiw Date: Thu, 16 Jan 2025 18:08:34 +0800 Subject: [PATCH] =?UTF-8?q?=E6=A0=87=E9=A2=98=E5=8A=A0=E5=BC=BA=E4=BC=98?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../customer_zh_title_enhance.py | 124 +++++++++++------- 1 file changed, 79 insertions(+), 45 deletions(-) diff --git a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/customer_zh_title_enhance.py b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/customer_zh_title_enhance.py index c3462a1..e9d347c 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/customer_zh_title_enhance.py +++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/customer_zh_title_enhance.py @@ -9,7 +9,7 @@ def get_fist_level_title( ) -> bool: # 文本长度为0,肯定不是title if len(text) == 0: - print("Not a title. Text is empty or longer than 25.") + logger.info("Not a title. Text is empty or longer than 25.") return "" splitlines = text.splitlines() @@ -33,7 +33,7 @@ def get_second_level_title( # 文本长度为0的话,肯定不是title lenght = len(text) if lenght == 0: - print("Not a title. Text is empty or longer than 25.") + logger.info("Not a title. Text is empty or longer than 25.") return "" #3 **** @@ -41,12 +41,15 @@ def get_second_level_title( #3.1.1 ***** #另一个分块 #3.1.2 ***** 所以二级目录可能在第二行 和第一行 - #只查找第一个匹配项 + #只查找最后匹配项 Second_TITLE = r'((? bool: # 文本长度为0的话,肯定不是title if len(text) == 0: - print("Not a title. Text is empty.") + logger.info("Not a title. Text is empty.") return False splitlines = text.splitlines() @@ -92,7 +95,7 @@ def get_third_level_title( ) -> str: # 文本长度为0的话,肯定不是title if len(text) == 0: - print("Not a title. Text is empty or longer than 25.") + logger.info("Not a title. Text is empty or longer than 25.") return "" #3 **** @@ -101,7 +104,7 @@ def get_third_level_title( #3.1.1.1 ***** #另一个分块 #3.1.1.2 ***** 所以三级级目录可能在第三行 和第二行及第一行 - #只查找第一个匹配项 + #只查找最后一个匹配项 Third_TITLE = r'((? Document: - title = None - #print(f"zh_third_title_enhance ....") + current_title = None if len(docs) > 0: for doc in docs: - print(f"zh_third_title_enhance: {doc}") + logger.info(f"zh_third_title_enhance: {doc}") third_title = get_third_level_title(doc.page_content) - if third_title: - title = third_title - print(f"title: {title}") - elif title: - print(f"title is not none") + if current_title is None: + if third_title: + current_title = third_title + logger.info(f"third title: {current_title}") + elif current_title is not None: + logger.info(f"third title is not none: {current_title}") temp_fourth_content = is_fourth_level_content(doc.page_content) if temp_fourth_content: - print(f"is_fourth_level_content : {temp_fourth_content}") - doc.page_content = f"{title} {doc.page_content}" + logger.info(f"is_fourth_level_content : {temp_fourth_content}") + doc.page_content = f"{current_title} {doc.page_content}" + logger.info(f"after zh_third_title_enhance: {doc}") + + if third_title and third_title != current_title: + current_title = third_title + logger.info(f"reset third title = {current_title}") + # else: + # logger.info(f"reset second title = none") + # current_title = None else: - title = None - print(f"final third title: {title}") + if third_title and third_title != current_title: + current_title = third_title + logger.info(f"reset third title = {current_title}") + else: + logger.info(f"reset third title = none") + current_title = None return docs else: - print("zh_third_title_enhance 文件不存在") + logger.info("zh_third_title_enhance 文件不存在") #给三级被分开的内容 增加二级标题 def zh_second_title_enhance(docs: Document) -> Document: current_title = None if len(docs) > 0: for doc in docs: - print(f"zh_second_title_enhance: {doc}") + logger.info(f"zh_second_title_enhance: {doc}") second_title = get_second_level_title(doc.page_content) - if second_title: - current_title = second_title - logger.debug(f"title: {current_title}") - continue - if current_title: - print(f"title is not none") + if current_title is None: + if second_title: + current_title = second_title + logger.info(f"second title: {current_title}") + elif current_title is not None: + logger.info(f"second title is not none: {current_title}") temp_third_content = is_third_level_content(doc.page_content) if temp_third_content: - print(f"is_third_level_content : {temp_third_content}") + logger.info(f"is_third_level_content : {temp_third_content}") doc.page_content = f"{current_title} {doc.page_content}" + logger.info(f"after zh_second_title_enhance: {doc}") + + if second_title and second_title != current_title: + current_title = second_title + logger.info(f"reset second title = {current_title}") else: - current_title = None - print(f"final second title: {current_title}") + if second_title and second_title != current_title: + current_title = second_title + logger.info(f"reset second title = {current_title}") + else: + logger.info(f"reset second title = none") + current_title = None return docs else: - print("zh_second_title_enhance 文件不存在") + logger.info("zh_second_title_enhance 文件不存在") #给二级被分开的内容 增加一级标题 def zh_first_title_enhance(docs: Document) -> Document: - title = None + current_title = None if len(docs) > 0: for doc in docs: - logger.debug(f"zh_first_title_enhance: {doc}") + logger.info(f"zh_first_title_enhance: {doc}") first_title = get_fist_level_title(doc.page_content) - if first_title: - title = first_title - logger.debug(f"title: {title}") - elif title: + if current_title is None: + if first_title: + current_title = first_title + logger.info(f"first title: {current_title}") + elif current_title is not None: + logger.info(f"second title is not none: {current_title}") temp_second_content = is_second_level_content(doc.page_content) if temp_second_content: - logger.debug(f"is_second_level_content : {temp_second_content}") - doc.page_content = f"{title} {doc.page_content}" + logger.info(f"is_second_level_content : {temp_second_content}") + doc.page_content = f"{current_title} {doc.page_content}" + logger.info(f"after zh_second_title_enhance: {doc}") + + if first_title and first_title != current_title: + current_title = first_title + logger.info(f"reset first title = {current_title}") else: - title = None - logger.debug(f"final first title: {title}") + if first_title and first_title != current_title: + current_title = first_title + logger.info(f"reset first title = {current_title}") + else: + logger.info(f"reset first title = none") + current_title = None return docs else: - print("zh_first_title_enhance 文件不存在") - + logger.info("zh_first_title_enhance 文件不存在") if __name__ == "__main__": str1 = """1 总 则\n1.1 本导则是编制和审查城市电力网(以下简称城网)规划的指导性文件,其 适用范围为国家电网公司所属的各网省公司、城市供电公司。\n1.2 城网是城市行政区划内为城市供电的各级电压电网的总称。城网是电力系 统的主要负荷中心,作为城市的重要基础设施之一,与城市的社会经济发展密切 相关。各城市应根据《中华人民共和国城市规划法》和《中华人民共和国电力法》 的相关规定,编制城网规划,并纳入相应的城市总体规划和各地区详细规划中。\n1.3 城网规划是城市总体规划的重要组成部分,应与城市的各项发展规划相互 配合、同步实施,做到与城市规划相协调,落实规划中所确定的线路走廊和地下 通道、变电站和配电室站址等供电设施用地。\n1.4 城网规划的目的是通过科学的规划,建设网络坚强、结构合理、安全可靠、 运行灵活、节能环保、经济高效的城市电网,不断提高城网供电能力和电能质量, 以满足城市经济增长和社会发展的需要。 ' metadata={'source': '/home/bns001/Langchain-Chatchat_0.2.9/knowledge_base/test/content/资产全寿命周期管理体系实施指南.docx'}"""