enhance 3rd catalog content

This commit is contained in:
wvivi2023 2023-12-28 10:52:52 +08:00
parent 9f327e71e4
commit 5c8610f47f
5 changed files with 170 additions and 26 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -13,7 +13,7 @@ from configs import (
TEXT_SPLITTER_NAME, TEXT_SPLITTER_NAME,
) )
import importlib import importlib
from text_splitter import zh_title_enhance as func_zh_title_enhance from text_splitter import zh_second_title_enhance
import langchain.document_loaders import langchain.document_loaders
from langchain.document_loaders.word_document import Docx2txtLoader from langchain.document_loaders.word_document import Docx2txtLoader
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -355,6 +355,7 @@ class KnowledgeFile:
i = i+1 i = i+1
if zh_title_enhance: if zh_title_enhance:
docs = zh_second_title_enhance(docs)
docs = customize_zh_title_enhance(docs) docs = customize_zh_title_enhance(docs)
i = 1 i = 1
outputfile = file_name_without_extension + "_split.txt" outputfile = file_name_without_extension + "_split.txt"

View File

@ -1,4 +1,5 @@
from .chinese_text_splitter import ChineseTextSplitter from .chinese_text_splitter import ChineseTextSplitter
from .ali_text_splitter import AliTextSplitter from .ali_text_splitter import AliTextSplitter
from .zh_title_enhance import zh_title_enhance from .zh_title_enhance import zh_title_enhance
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from .zh_second_title_enhance import zh_second_title_enhance

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,113 @@
from langchain.docstore.document import Document
import re
def get_fist_level_title(
    text: str,
) -> str:
    """Return the first line of ``text`` if it looks like a first-level title.

    Only the first line is inspected.  It qualifies when it does not end in
    a punctuation character and contains, at the start of the line or after
    whitespace, an integer followed by horizontal whitespace and then at
    least two non-whitespace characters, the first of which is not a dot
    (for example ``"6 进出等电位"``).

    Args:
        text: Text whose first line is tested.

    Returns:
        The first line when it qualifies, otherwise an empty string.
    """
    # The previous guard, ``len(text) == 0 and len(text) >= 25``, could never
    # be true, so empty input reached ``splitlines()[0]`` and raised
    # IndexError.  Only the empty case is rejected here; the length cap was
    # never in effect and enabling it would change which inputs are accepted.
    if not text:
        return ""

    first_line = text.splitlines()[0]

    # A line that ends in punctuation is treated as a sentence, not a title.
    ends_in_punct_re = re.compile(r"[^\w\s]\Z")
    if ends_in_punct_re.search(first_line) is not None:
        return ""

    first_title_re = re.compile(
        r'((?<!\.|[a-zA-Z0-9]|\S)\d+[^\S\n]+[^\s\.]+\S+)'
    )
    if first_title_re.search(first_line) is not None:
        return first_line
    return ""
#return the 2nd level title
def get_second_level_title(
    text: str,
) -> str:
    """Return the second-level title found in the first two lines of ``text``.

    A second-level title pattern is, at the start of the line or after
    whitespace, ``<int>.<int>`` (whitespace allowed around the dot), then
    horizontal whitespace, then a run of characters that are neither
    whitespace nor dots and that is not followed by a dot or an ASCII
    letter/digit (for example ``"6.1 直线塔进出等电位"``).

    The first line is tested first; if it ends in a punctuation character the
    result is empty without looking further.  If the first line does not
    contain the pattern, the second line (when present) is tested.

    Args:
        text: Text whose first two lines are tested.

    Returns:
        The matching line, or an empty string when neither line matches.
    """
    # The previous guard, ``len(text) == 0 and len(text) >= 25``, could never
    # be true, so empty input reached ``splitlines()[0]`` and raised
    # IndexError.  Only the empty case is rejected here; the length cap was
    # never in effect and enabling it would change which inputs are accepted.
    if not text:
        return ""

    lines = text.splitlines()
    first_line = lines[0]

    # A first line that ends in punctuation is treated as a sentence.
    ends_in_punct_re = re.compile(r"[^\w\s]\Z")
    if ends_in_punct_re.search(first_line) is not None:
        return ""

    second_title_re = re.compile(
        r'((?<!\.|[a-zA-Z0-9]|\S)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )
    if second_title_re.search(first_line) is not None:
        return first_line

    # Fall back to the second line, e.g. when the text opens with a
    # first-level heading followed by the second-level one.
    if len(lines) > 1:
        second_line = lines[1]
        if second_title_re.search(second_line) is not None:
            return second_line
    return ""
#judge if it is 3rd level content
def is_third_level_content(
    text: str,
) -> bool:
    """Tell whether the first line of ``text`` carries a third-level number.

    The first line qualifies when it contains, at the start of the line or
    after whitespace, ``<int>.<int>.<int>`` (whitespace allowed around the
    dots), then horizontal whitespace, then a run of characters that are
    neither whitespace nor dots and that is not followed by a dot or an
    ASCII letter/digit (for example ``"6.1.1 对于直线塔"``).

    Args:
        text: Text whose first line is tested.

    Returns:
        True when the first line matches, False otherwise (including for
        empty text, which also prints a notice).
    """
    third_level_re = re.compile(
        r'((?<!\.|[a-zA-Z0-9]|\S)\s*[0-9]+\s*\.\s*[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )

    # Empty text has no first line to inspect.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    leading_line = text.splitlines()[0]
    return third_level_re.search(leading_line) is not None
#给三级被分开的内容 增加二级标题
def zh_second_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Prefix third-level chunks with the most recent second-level title.

    Walks ``docs`` in order.  When a document yields a second-level title
    (via ``get_second_level_title``) that title is remembered.  While a title
    is remembered, each following document that is third-level content (via
    ``is_third_level_content``) has ``"<title> "`` prepended to its
    ``page_content``; the first document that is neither clears the
    remembered title.

    Args:
        docs: Documents to process; their ``page_content`` is modified in
            place.

    Returns:
        The same ``docs`` object.  Empty input is returned unchanged after
        printing a notice.
    """
    # Previously the empty branch fell off the end and returned None, which
    # callers that reassign the result would then pass on; return the input
    # itself instead.
    if not docs:
        print("文件不存在")
        return docs

    title = None
    for doc in docs:
        second_title = get_second_level_title(doc.page_content)
        if second_title:
            title = second_title
        elif title:
            if is_third_level_content(doc.page_content):
                doc.page_content = f"{title} {doc.page_content}"
            else:
                # The run of third-level chunks under this title has ended.
                title = None
    return docs
if __name__ == "__main__":
    # Manual smoke test: run the detectors on a sample chunk and print the
    # results.  The sample is bound to ``sample_text`` rather than ``str`` so
    # the builtin is not rebound at module scope.
    sample_text = """6 进出等电位
6.1 直线塔进出等电位
6.1.1 对于直线塔 作业人员不得从横担或绝缘子串垂直进出等电位 可采用吊篮吊椅吊梯 绝缘软梯法等方式进出等电位
6.1.2 等电位作业人员进出等电位时与接地体及带电体的各电气间隙距离包括安全距离组合间隙 均应满足表 1 3 要求
6.1.3 吊篮吊椅吊梯必须用吊拉绳索稳固悬吊 吊篮吊椅吊梯的移动速度必须用绝缘滑 车组严格控制 做到均匀慢速 固定吊拉绳索的长度 应准确计算或实际丈量 保证等电位作业人员 即将进入等电位时人体最高部位不超过导线侧均压环
6.2 耐张塔进出等电位
6.2.1 在耐张塔进出等电位时作业人员可采用沿耐张绝缘子串方法或其它方法进出等电位
6.2.2 等电位作业人员沿绝缘子串移动时 手与脚的位置必须保持对应一致 且人体和工具短接的绝 缘子片数应符合 5.2.2 的要求
6.2.3 等电位作业人员所系安全带应绑在手扶的绝缘子串上并与等电位作业人员同步移动
6.2.4 等电位作业人员在进出等电位时应在移动至距离带电体 3 片绝缘子时进行电位转移方可进 行后续操作
6.2.5 带电作业人员与接地体及带电体的各电气间隙距离包括安全距离组合间隙和经人体或工 具短接后的良好绝缘子片数均应满足表 4 要求否则不能沿耐张绝缘子串进出等电位
7 作业中的注意事项
7.1 等电位作业人员在带电作业过程中时应避免身体动作幅度过大
7.2 等电位作业人员与地电位作业人员之间传递物品应采用绝缘工具绝缘工具的有效长度应满足 2 的规定
7.3 屏蔽服装应无破损和孔洞 各部分应连接良好可靠发现破损和毛刺时应送有资质的试验单位 进行屏蔽服装电阻和屏蔽效率测量测量结果满足本标准 5.3.1 条的要求后方可使用
7.4 绝缘工具在使用前 应使用 2500V 及以上兆欧表进行分段检测电极宽 2cm极间宽 2cm阻值 不低于 700MΩ"""
    # Whether the first line of the sample is third-level content.
    title = is_third_level_content(sample_text)
    print(title)
    # The second-level title found in the first two lines of the sample.
    title = get_second_level_title(sample_text)
    print(title)