标题加强优化

This commit is contained in:
weiweiw 2025-01-16 18:08:34 +08:00
parent a38364a980
commit a72de9ecc4
1 changed files with 79 additions and 45 deletions

View File

@ -9,7 +9,7 @@ def get_fist_level_title(
) -> bool: ) -> bool:
# 文本长度为0,肯定不是title # 文本长度为0,肯定不是title
if len(text) == 0: if len(text) == 0:
print("Not a title. Text is empty or longer than 25.") logger.info("Not a title. Text is empty or longer than 25.")
return "" return ""
splitlines = text.splitlines() splitlines = text.splitlines()
@ -33,7 +33,7 @@ def get_second_level_title(
# 文本长度为0的话肯定不是title # 文本长度为0的话肯定不是title
lenght = len(text) lenght = len(text)
if lenght == 0: if lenght == 0:
print("Not a title. Text is empty or longer than 25.") logger.info("Not a title. Text is empty or longer than 25.")
return "" return ""
#3 **** #3 ****
@ -41,12 +41,15 @@ def get_second_level_title(
#3.1.1 ***** #3.1.1 *****
#另一个分块 #另一个分块
#3.1.2 ***** 所以二级目录可能在第二行 和第一行 #3.1.2 ***** 所以二级目录可能在第二行 和第一行
#只查找第一个匹配项 #只查找最后匹配项
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9])|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))' Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9])|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))'
TITLE_PUNCT_RE = re.compile(Second_TITLE) TITLE_PUNCT_RE = re.compile(Second_TITLE)
match = TITLE_PUNCT_RE.search(text) # match = TITLE_PUNCT_RE.search(text)
if match: # if match:
return match.group(1) # return match.group(1)
matches = TITLE_PUNCT_RE.findall(text)
if matches:
return matches[-1][0]
return "" return ""
#judge if it is 2nd level content #judge if it is 2nd level content
@ -74,7 +77,7 @@ def is_third_level_content(
) -> bool: ) -> bool:
# 文本长度为0的话肯定不是title # 文本长度为0的话肯定不是title
if len(text) == 0: if len(text) == 0:
print("Not a title. Text is empty.") logger.info("Not a title. Text is empty.")
return False return False
splitlines = text.splitlines() splitlines = text.splitlines()
@ -92,7 +95,7 @@ def get_third_level_title(
) -> str: ) -> str:
# 文本长度为0的话肯定不是title # 文本长度为0的话肯定不是title
if len(text) == 0: if len(text) == 0:
print("Not a title. Text is empty or longer than 25.") logger.info("Not a title. Text is empty or longer than 25.")
return "" return ""
#3 **** #3 ****
@ -101,7 +104,7 @@ def get_third_level_title(
#3.1.1.1 ***** #3.1.1.1 *****
#另一个分块 #另一个分块
#3.1.1.2 ***** 所以三级级目录可能在第三行 和第二行及第一行 #3.1.1.2 ***** 所以三级级目录可能在第三行 和第二行及第一行
#只查找一个匹配项 #只查找最后一个匹配项
Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))' Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
TITLE_PUNCT_RE = re.compile(Third_TITLE) TITLE_PUNCT_RE = re.compile(Third_TITLE)
match = TITLE_PUNCT_RE.search(text) # 只查找第一个匹配项 match = TITLE_PUNCT_RE.search(text) # 只查找第一个匹配项
@ -143,74 +146,105 @@ def is_fourth_level_content(
#给四级被分开的内容 增加三级标题 #给四级被分开的内容 增加三级标题
def zh_third_title_enhance(docs: Document) -> Document: def zh_third_title_enhance(docs: Document) -> Document:
title = None current_title = None
#print(f"zh_third_title_enhance ....")
if len(docs) > 0: if len(docs) > 0:
for doc in docs: for doc in docs:
print(f"zh_third_title_enhance: {doc}") logger.info(f"zh_third_title_enhance: {doc}")
third_title = get_third_level_title(doc.page_content) third_title = get_third_level_title(doc.page_content)
if current_title is None:
if third_title: if third_title:
title = third_title current_title = third_title
print(f"title: {title}") logger.info(f"third title: {current_title}")
elif title: elif current_title is not None:
print(f"title is not none") logger.info(f"third title is not none: {current_title}")
temp_fourth_content = is_fourth_level_content(doc.page_content) temp_fourth_content = is_fourth_level_content(doc.page_content)
if temp_fourth_content: if temp_fourth_content:
print(f"is_fourth_level_content : {temp_fourth_content}") logger.info(f"is_fourth_level_content : {temp_fourth_content}")
doc.page_content = f"{title} {doc.page_content}" doc.page_content = f"{current_title} {doc.page_content}"
logger.info(f"after zh_third_title_enhance: {doc}")
if third_title and third_title != current_title:
current_title = third_title
logger.info(f"reset third title = {current_title}")
# else:
# logger.info(f"reset second title = none")
# current_title = None
else: else:
title = None if third_title and third_title != current_title:
print(f"final third title: {title}") current_title = third_title
logger.info(f"reset third title = {current_title}")
else:
logger.info(f"reset third title = none")
current_title = None
return docs return docs
else: else:
print("zh_third_title_enhance 文件不存在") logger.info("zh_third_title_enhance 文件不存在")
#给三级被分开的内容 增加二级标题 #给三级被分开的内容 增加二级标题
def zh_second_title_enhance(docs: Document) -> Document: def zh_second_title_enhance(docs: Document) -> Document:
current_title = None current_title = None
if len(docs) > 0: if len(docs) > 0:
for doc in docs: for doc in docs:
print(f"zh_second_title_enhance: {doc}") logger.info(f"zh_second_title_enhance: {doc}")
second_title = get_second_level_title(doc.page_content) second_title = get_second_level_title(doc.page_content)
if current_title is None:
if second_title: if second_title:
current_title = second_title current_title = second_title
logger.debug(f"title: {current_title}") logger.info(f"second title: {current_title}")
continue elif current_title is not None:
if current_title: logger.info(f"second title is not none: {current_title}")
print(f"title is not none")
temp_third_content = is_third_level_content(doc.page_content) temp_third_content = is_third_level_content(doc.page_content)
if temp_third_content: if temp_third_content:
print(f"is_third_level_content : {temp_third_content}") logger.info(f"is_third_level_content : {temp_third_content}")
doc.page_content = f"{current_title} {doc.page_content}" doc.page_content = f"{current_title} {doc.page_content}"
logger.info(f"after zh_second_title_enhance: {doc}")
if second_title and second_title != current_title:
current_title = second_title
logger.info(f"reset second title = {current_title}")
else: else:
if second_title and second_title != current_title:
current_title = second_title
logger.info(f"reset second title = {current_title}")
else:
logger.info(f"reset second title = none")
current_title = None current_title = None
print(f"final second title: {current_title}")
return docs return docs
else: else:
print("zh_second_title_enhance 文件不存在") logger.info("zh_second_title_enhance 文件不存在")
#给二级被分开的内容 增加一级标题 #给二级被分开的内容 增加一级标题
def zh_first_title_enhance(docs: Document) -> Document: def zh_first_title_enhance(docs: Document) -> Document:
title = None current_title = None
if len(docs) > 0: if len(docs) > 0:
for doc in docs: for doc in docs:
logger.debug(f"zh_first_title_enhance: {doc}") logger.info(f"zh_first_title_enhance: {doc}")
first_title = get_fist_level_title(doc.page_content) first_title = get_fist_level_title(doc.page_content)
if current_title is None:
if first_title: if first_title:
title = first_title current_title = first_title
logger.debug(f"title: {title}") logger.info(f"first title: {current_title}")
elif title: elif current_title is not None:
logger.info(f"second title is not none: {current_title}")
temp_second_content = is_second_level_content(doc.page_content) temp_second_content = is_second_level_content(doc.page_content)
if temp_second_content: if temp_second_content:
logger.debug(f"is_second_level_content : {temp_second_content}") logger.info(f"is_second_level_content : {temp_second_content}")
doc.page_content = f"{title} {doc.page_content}" doc.page_content = f"{current_title} {doc.page_content}"
logger.info(f"after zh_second_title_enhance: {doc}")
if first_title and first_title != current_title:
current_title = first_title
logger.info(f"reset first title = {current_title}")
else: else:
title = None if first_title and first_title != current_title:
logger.debug(f"final first title: {title}") current_title = first_title
logger.info(f"reset first title = {current_title}")
else:
logger.info(f"reset first title = none")
current_title = None
return docs return docs
else: else:
print("zh_first_title_enhance 文件不存在") logger.info("zh_first_title_enhance 文件不存在")
if __name__ == "__main__": if __name__ == "__main__":
str1 = """1 总 则\n1.1 本导则是编制和审查城市电力网(以下简称城网)规划的指导性文件,其 适用范围为国家电网公司所属的各网省公司、城市供电公司。\n1.2 城网是城市行政区划内为城市供电的各级电压电网的总称。城网是电力系 统的主要负荷中心,作为城市的重要基础设施之一,与城市的社会经济发展密切 相关。各城市应根据《中华人民共和国城市规划法》和《中华人民共和国电力法》 的相关规定,编制城网规划,并纳入相应的城市总体规划和各地区详细规划中。\n1.3 城网规划是城市总体规划的重要组成部分,应与城市的各项发展规划相互 配合、同步实施,做到与城市规划相协调,落实规划中所确定的线路走廊和地下 通道、变电站和配电室站址等供电设施用地。\n1.4 城网规划的目的是通过科学的规划,建设网络坚强、结构合理、安全可靠、 运行灵活、节能环保、经济高效的城市电网,不断提高城网供电能力和电能质量, 以满足城市经济增长和社会发展的需要。 ' metadata={'source': '/home/bns001/Langchain-Chatchat_0.2.9/knowledge_base/test/content/资产全寿命周期管理体系实施指南.docx'}""" str1 = """1 总 则\n1.1 本导则是编制和审查城市电力网(以下简称城网)规划的指导性文件,其 适用范围为国家电网公司所属的各网省公司、城市供电公司。\n1.2 城网是城市行政区划内为城市供电的各级电压电网的总称。城网是电力系 统的主要负荷中心,作为城市的重要基础设施之一,与城市的社会经济发展密切 相关。各城市应根据《中华人民共和国城市规划法》和《中华人民共和国电力法》 的相关规定,编制城网规划,并纳入相应的城市总体规划和各地区详细规划中。\n1.3 城网规划是城市总体规划的重要组成部分,应与城市的各项发展规划相互 配合、同步实施,做到与城市规划相协调,落实规划中所确定的线路走廊和地下 通道、变电站和配电室站址等供电设施用地。\n1.4 城网规划的目的是通过科学的规划,建设网络坚强、结构合理、安全可靠、 运行灵活、节能环保、经济高效的城市电网,不断提高城网供电能力和电能质量, 以满足城市经济增长和社会发展的需要。 ' metadata={'source': '/home/bns001/Langchain-Chatchat_0.2.9/knowledge_base/test/content/资产全寿命周期管理体系实施指南.docx'}"""