2023-12-28 10:52:52 +08:00
|
|
|
|
from langchain.docstore.document import Document
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
def get_fist_level_title(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
) -> bool:
|
2024-02-28 16:40:45 +08:00
|
|
|
|
# 文本长度为0,肯定不是title
|
|
|
|
|
|
if len(text) == 0:
|
2023-12-28 10:52:52 +08:00
|
|
|
|
print("Not a title. Text is empty or longer than 25.")
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
splitlines = text.splitlines()
|
|
|
|
|
|
first_line = splitlines[0]
|
|
|
|
|
|
# 文本中有标点符号,就不是title
|
|
|
|
|
|
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
|
|
|
|
|
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
|
|
|
|
|
if ENDS_IN_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return ""
|
2024-02-28 16:40:45 +08:00
|
|
|
|
FIRST_TITLE = r'((?<!.)\d+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9])|((?<!.)第\s*\S+\s*章\s+\S+))'
|
2023-12-28 10:52:52 +08:00
|
|
|
|
TITLE_PUNCT_RE = re.compile(FIRST_TITLE)
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return first_line
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
#return the 2nd level title
|
|
|
|
|
|
def get_second_level_title(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
) -> str:
|
|
|
|
|
|
# 文本长度为0的话,肯定不是title
|
2024-02-28 16:40:45 +08:00
|
|
|
|
lenght = len(text)
|
|
|
|
|
|
if lenght == 0:
|
2023-12-28 10:52:52 +08:00
|
|
|
|
print("Not a title. Text is empty or longer than 25.")
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
splitlines = text.splitlines()
|
|
|
|
|
|
first_line = splitlines[0]
|
|
|
|
|
|
# 文本中有标点符号,就不是title
|
2024-03-12 11:09:37 +08:00
|
|
|
|
# ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
|
|
|
|
|
# ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
|
|
|
|
|
# if ENDS_IN_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
# return ""
|
2024-02-28 16:40:45 +08:00
|
|
|
|
|
|
|
|
|
|
#3 ****
|
|
|
|
|
|
#3.1 *****
|
|
|
|
|
|
#3.1.1 *****
|
|
|
|
|
|
#另一个分块
|
|
|
|
|
|
#3.1.2 ***** 所以二级目录可能在第二行 和第一行
|
2024-02-27 17:24:04 +08:00
|
|
|
|
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9])|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|:)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))'
|
2023-12-28 10:52:52 +08:00
|
|
|
|
TITLE_PUNCT_RE = re.compile(Second_TITLE)
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return first_line
|
|
|
|
|
|
else:
|
|
|
|
|
|
if len(splitlines)>1:
|
|
|
|
|
|
Second_line = splitlines[1]
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(Second_line) is not None:
|
|
|
|
|
|
return Second_line
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
2024-01-19 15:54:26 +08:00
|
|
|
|
#judge if it is 2nd level content
|
|
|
|
|
|
def is_second_level_content(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
) -> bool:
|
|
|
|
|
|
# 文本长度为0的话,肯定不是title
|
|
|
|
|
|
if len(text) == 0:
|
|
|
|
|
|
print("Not a title. Text is empty.")
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
splitlines = text.splitlines()
|
|
|
|
|
|
first_line = splitlines[0]
|
|
|
|
|
|
|
2024-02-27 17:24:04 +08:00
|
|
|
|
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|(?<!.)(表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|:)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、)'
|
2024-01-19 15:54:26 +08:00
|
|
|
|
TITLE_PUNCT_RE = re.compile(Second_TITLE)
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
2023-12-28 10:52:52 +08:00
|
|
|
|
#judge if it is 3rd level content
|
|
|
|
|
|
def is_third_level_content(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
) -> bool:
|
|
|
|
|
|
# 文本长度为0的话,肯定不是title
|
|
|
|
|
|
if len(text) == 0:
|
|
|
|
|
|
print("Not a title. Text is empty.")
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
splitlines = text.splitlines()
|
|
|
|
|
|
first_line = splitlines[0]
|
|
|
|
|
|
|
2024-02-28 16:40:45 +08:00
|
|
|
|
Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|((?<!.)表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)|((?<!.)(一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十))|((?<!.)(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))'
|
2023-12-28 10:52:52 +08:00
|
|
|
|
TITLE_PUNCT_RE = re.compile(Third_TITLE)
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
return False
|
2024-02-27 17:24:04 +08:00
|
|
|
|
|
|
|
|
|
|
def get_third_level_title(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
) -> str:
|
|
|
|
|
|
# 文本长度为0的话,肯定不是title
|
2024-02-28 16:40:45 +08:00
|
|
|
|
if len(text) == 0:
|
2024-02-27 17:24:04 +08:00
|
|
|
|
print("Not a title. Text is empty or longer than 25.")
|
|
|
|
|
|
return ""
|
2023-12-28 10:52:52 +08:00
|
|
|
|
|
2024-02-27 17:24:04 +08:00
|
|
|
|
splitlines = text.splitlines()
|
|
|
|
|
|
first_line = splitlines[0]
|
|
|
|
|
|
# 文本中有标点符号,就不是title
|
2024-03-12 11:09:37 +08:00
|
|
|
|
# ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
|
|
|
|
|
# ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
|
|
|
|
|
# if ENDS_IN_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
# return ""
|
2024-02-28 16:40:45 +08:00
|
|
|
|
|
|
|
|
|
|
#3 ****
|
|
|
|
|
|
#3.1 *****
|
|
|
|
|
|
#3.1.1 *****
|
|
|
|
|
|
#3.1.1.1 *****
|
|
|
|
|
|
#另一个分块
|
|
|
|
|
|
#3.1.1.2 ***** 所以三级级目录可能在第三行 和第二行及第一行
|
2024-02-27 17:24:04 +08:00
|
|
|
|
Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
|
|
|
|
|
|
TITLE_PUNCT_RE = re.compile(Third_TITLE)
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return first_line
|
|
|
|
|
|
else:
|
|
|
|
|
|
if len(splitlines)>1:
|
|
|
|
|
|
Second_line = splitlines[1]
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(Second_line) is not None:
|
|
|
|
|
|
return Second_line
|
2024-02-28 16:40:45 +08:00
|
|
|
|
else:
|
|
|
|
|
|
if len(splitlines)>2:
|
|
|
|
|
|
Second_line = splitlines[2]
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(Second_line) is not None:
|
|
|
|
|
|
return Second_line
|
|
|
|
|
|
|
2024-02-27 17:24:04 +08:00
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
|
|
#judge if it is 4th level content
|
|
|
|
|
|
def is_fourth_level_content(
|
|
|
|
|
|
text: str,
|
|
|
|
|
|
) -> bool:
|
|
|
|
|
|
# 文本长度为0的话,肯定不是title
|
|
|
|
|
|
if len(text) == 0:
|
|
|
|
|
|
print("Not a title. Text is empty.")
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
splitlines = text.splitlines()
|
|
|
|
|
|
first_line = splitlines[0]
|
|
|
|
|
|
|
|
|
|
|
|
Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
|
|
|
|
|
|
TITLE_PUNCT_RE = re.compile(Third_TITLE)
|
|
|
|
|
|
if TITLE_PUNCT_RE.search(first_line) is not None:
|
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
#给四级被分开的内容 增加三级标题
|
|
|
|
|
|
def zh_third_title_enhance(docs: Document) -> Document:
|
|
|
|
|
|
title = None
|
2024-03-18 09:28:30 +08:00
|
|
|
|
#print(f"zh_third_title_enhance ....")
|
2024-02-27 17:24:04 +08:00
|
|
|
|
if len(docs) > 0:
|
|
|
|
|
|
for doc in docs:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"zh_third_title_enhance: {doc}")
|
2024-02-27 17:24:04 +08:00
|
|
|
|
third_title = get_third_level_title(doc.page_content)
|
|
|
|
|
|
if third_title:
|
|
|
|
|
|
title = third_title
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"title: {title}")
|
2024-02-27 17:24:04 +08:00
|
|
|
|
elif title:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"title is not none")
|
2024-02-27 17:24:04 +08:00
|
|
|
|
temp_fourth_content = is_fourth_level_content(doc.page_content)
|
|
|
|
|
|
if temp_fourth_content:
|
2024-03-06 14:50:45 +08:00
|
|
|
|
#print(f"is_fourth_level_content : {temp_fourth_content}")
|
2024-02-27 17:24:04 +08:00
|
|
|
|
doc.page_content = f"{title} {doc.page_content}"
|
|
|
|
|
|
else:
|
|
|
|
|
|
title = None
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"final title: {title}")
|
2024-02-27 17:24:04 +08:00
|
|
|
|
return docs
|
|
|
|
|
|
else:
|
|
|
|
|
|
print("zh_third_title_enhance 文件不存在")
|
|
|
|
|
|
|
2023-12-28 10:52:52 +08:00
|
|
|
|
#给三级被分开的内容 增加二级标题
|
|
|
|
|
|
def zh_second_title_enhance(docs: Document) -> Document:
|
|
|
|
|
|
title = None
|
|
|
|
|
|
if len(docs) > 0:
|
|
|
|
|
|
for doc in docs:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"zh_second_title_enhance: {doc}")
|
2023-12-28 10:52:52 +08:00
|
|
|
|
second_title = get_second_level_title(doc.page_content)
|
|
|
|
|
|
if second_title:
|
|
|
|
|
|
title = second_title
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"title: {title}")
|
2023-12-28 10:52:52 +08:00
|
|
|
|
elif title:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"title is not none")
|
2023-12-28 10:52:52 +08:00
|
|
|
|
temp_third_content = is_third_level_content(doc.page_content)
|
|
|
|
|
|
if temp_third_content:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"is_third_level_content : {temp_third_content}")
|
2023-12-28 10:52:52 +08:00
|
|
|
|
doc.page_content = f"{title} {doc.page_content}"
|
|
|
|
|
|
else:
|
|
|
|
|
|
title = None
|
2024-03-18 09:28:30 +08:00
|
|
|
|
print(f"final title: {title}")
|
2023-12-28 10:52:52 +08:00
|
|
|
|
return docs
|
|
|
|
|
|
else:
|
2024-01-19 15:54:26 +08:00
|
|
|
|
print("zh_second_title_enhance 文件不存在")
|
|
|
|
|
|
|
|
|
|
|
|
#给二级被分开的内容 增加一级标题
|
|
|
|
|
|
def zh_first_title_enhance(docs: Document) -> Document:
|
|
|
|
|
|
title = None
|
|
|
|
|
|
if len(docs) > 0:
|
|
|
|
|
|
for doc in docs:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"zh_first_title_enhance: {doc}")
|
2024-01-19 15:54:26 +08:00
|
|
|
|
first_title = get_fist_level_title(doc.page_content)
|
|
|
|
|
|
if first_title:
|
|
|
|
|
|
title = first_title
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"title: {title}")
|
2024-01-19 15:54:26 +08:00
|
|
|
|
elif title:
|
|
|
|
|
|
temp_second_content = is_second_level_content(doc.page_content)
|
|
|
|
|
|
if temp_second_content:
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"is_second_level_content : {temp_second_content}")
|
2024-01-19 15:54:26 +08:00
|
|
|
|
doc.page_content = f"{title} {doc.page_content}"
|
|
|
|
|
|
else:
|
|
|
|
|
|
title = None
|
2024-04-01 13:54:24 +08:00
|
|
|
|
#print(f"final title: {title}")
|
2024-01-19 15:54:26 +08:00
|
|
|
|
return docs
|
|
|
|
|
|
else:
|
|
|
|
|
|
print("zh_first_title_enhance 文件不存在")
|
2023-12-28 10:52:52 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
2024-02-28 16:40:45 +08:00
|
|
|
|
str = """1 总 则\n1.1 本导则是编制和审查城市电力网(以下简称城网)规划的指导性文件,其 适用范围为国家电网公司所属的各网省公司、城市供电公司。\n1.2 城网是城市行政区划内为城市供电的各级电压电网的总称。城网是电力系 统的主要负荷中心,作为城市的重要基础设施之一,与城市的社会经济发展密切 相关。各城市应根据《中华人民共和国城市规划法》和《中华人民共和国电力法》 的相关规定,编制城网规划,并纳入相应的城市总体规划和各地区详细规划中。\n1.3 城网规划是城市总体规划的重要组成部分,应与城市的各项发展规划相互 配合、同步实施,做到与城市规划相协调,落实规划中所确定的线路走廊和地下 通道、变电站和配电室站址等供电设施用地。\n1.4 城网规划的目的是通过科学的规划,建设网络坚强、结构合理、安全可靠、 运行灵活、节能环保、经济高效的城市电网,不断提高城网供电能力和电能质量, 以满足城市经济增长和社会发展的需要。 ' metadata={'source': '/home/bns001/Langchain-Chatchat_0.2.9/knowledge_base/test/content/资产全寿命周期管理体系实施指南.docx'}"""
|
|
|
|
|
|
title = get_fist_level_title(str)
|
2023-12-28 10:52:52 +08:00
|
|
|
|
print(title)
|
2024-02-28 16:40:45 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2023-12-28 10:52:52 +08:00
|
|
|
|
#zh_second_title_enhance()
|