增加对四级目录的支持和其他支持目录的标题增强

This commit is contained in:
wvivi2023 2024-02-27 17:24:04 +08:00
parent 7f8855afe0
commit d123ad7c29
4 changed files with 89 additions and 9 deletions

View File

@ -11,6 +11,7 @@ from configs import (
TEXT_SPLITTER_NAME,
)
import importlib
from text_splitter import zh_third_title_enhance
from text_splitter import zh_second_title_enhance
from text_splitter import zh_first_title_enhance
import langchain.document_loaders
@ -371,6 +372,7 @@ class KnowledgeFile:
return []
#先给二级下 被分开的三级目录分块 增加二级标题,再给分开的二级目录增加一级标题,然后给整个文档的所有分块增加文档标题分块
if zh_title_enhance:
docs = zh_third_title_enhance(docs)
docs = zh_second_title_enhance(docs)
docs = zh_first_title_enhance(docs)
docs = customize_zh_title_enhance(docs)

View File

@ -2,5 +2,6 @@ from .chinese_text_splitter import ChineseTextSplitter
from .ali_text_splitter import AliTextSplitter
from .zh_title_enhance import zh_title_enhance
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
from .zh_second_title_enhance import zh_third_title_enhance
from .zh_second_title_enhance import zh_second_title_enhance
from .zh_second_title_enhance import zh_first_title_enhance
from .zh_second_title_enhance import zh_first_title_enhance

View File

@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
First_SEPARATOE = "\n\n\n\n\n\n\n\n\n\n"
Second_SEPARATOE = "\n\n\n\n\n\n\n\n"
Third_SEPARATOE = "\n\n\n\n\n\n"
Fourth_SEPARATOE = "\n\n\n\n"
def _split_text_with_regex_from_end(
text: str, separator: str, keep_separator: bool
) -> List[str]:
@ -41,7 +42,8 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
self._separators = separators or [
First_SEPARATOE,
Second_SEPARATOE,
Third_SEPARATOE
Third_SEPARATOE,
Fourth_SEPARATOE
#"\n\n",
#"\n",
# "。||",
@ -60,21 +62,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
separator = separators[-1]
new_separators = []
if self.is_recursive == False:
#一级目录
text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块
text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的
text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换
text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章
#text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2
#二级目录
text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2
text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\n\n\n\n\1", text) # 通过\n1.2 这样的章和节来分块
text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+第\s*\S+\s*条(:|))', r"\n\n\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\1", text) # 通过第 条
#三级目录
text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\n\n\1", text) # 再通过 1.2.3
text = re.sub(r'(\n+((一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十)))', r"\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))', r"\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+((一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十)))', r"\n\n\n\n\n\n\1", text)
text = re.sub(r'(\n+(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))', r"\n\n\n\n\n\n\1", text)
#四级目录
text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\1", text) # 再通过 1.2.3
text = text.rstrip() # 段尾如果有多余的\n就去掉它
self.is_recursive = True
for i, _s in enumerate(separators):

View File

@ -17,7 +17,7 @@ def get_fist_level_title(
if ENDS_IN_PUNCT_RE.search(first_line) is not None:
return ""
FIRST_TITLE = r'((?<!.)\d+[^\S\n]+[^\s\.]+\S+)'
FIRST_TITLE = r'((?<!.)\d+[^\S\n]+[^\s\.]+\S+|(?<!.)第\s*\S+\s*章\s+)'
TITLE_PUNCT_RE = re.compile(FIRST_TITLE)
if TITLE_PUNCT_RE.search(first_line) is not None:
return first_line
@ -40,7 +40,7 @@ def get_second_level_title(
if ENDS_IN_PUNCT_RE.search(first_line) is not None:
return ""
Second_TITLE = r'((?<!.)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9])|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))'
TITLE_PUNCT_RE = re.compile(Second_TITLE)
if TITLE_PUNCT_RE.search(first_line) is not None:
return first_line
@ -63,7 +63,7 @@ def is_second_level_content(
splitlines = text.splitlines()
first_line = splitlines[0]
Second_TITLE = r'((?<!.)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|(?<!.)(表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)'
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|(?<!.)(表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、)'
TITLE_PUNCT_RE = re.compile(Second_TITLE)
if TITLE_PUNCT_RE.search(first_line) is not None:
return True
@ -82,13 +82,83 @@ def is_third_level_content(
splitlines = text.splitlines()
first_line = splitlines[0]
Third_TITLE = r'((?<!.)[0-9]+\s*\.\s*[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|((?<!.)表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)'
Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|((?<!.)表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)|((?<!.)(一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十))|((?<!.)(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))'
TITLE_PUNCT_RE = re.compile(Third_TITLE)
if TITLE_PUNCT_RE.search(first_line) is not None:
return True
return False
def get_third_level_title(
    text: str,
) -> str:
    """Return the third-level heading line (e.g. ``1.2.3 Title``) of *text*.

    The first line is checked first; if it is not a third-level heading,
    the second line (when present) is checked as well.

    Args:
        text: Content of one document chunk.

    Returns:
        The matching heading line, or ``""`` when *text* is empty, when its
        first line ends in a punctuation character, or when neither of the
        first two lines looks like a third-level heading.
    """
    # Empty text has no lines at all, so indexing splitlines() below would
    # raise IndexError. The previous guard (`== 0 and >= 25`) could never be
    # true and therefore never protected against this.
    if not text:
        print("Not a title. Text is empty.")
        return ""

    lines = text.splitlines()
    first_line = lines[0]

    # A line that ends with a punctuation character is treated as body text,
    # not as a heading.
    ends_in_punct_re = re.compile(r"[^\w\s]\Z")
    if ends_in_punct_re.search(first_line) is not None:
        return ""

    # Exactly three dot-separated alphanumeric components at the very start
    # of the line, followed by horizontal whitespace and the heading text.
    third_title_re = re.compile(
        r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )
    if third_title_re.search(first_line) is not None:
        return first_line

    # Fall back to the second line; only the heading pattern is checked here,
    # not the trailing-punctuation rule.
    if len(lines) > 1:
        second_line = lines[1]
        if third_title_re.search(second_line) is not None:
            return second_line
    return ""
#judge if it is 4th level content
def is_fourth_level_content(
    text: str,
) -> bool:
    """Tell whether the first line of *text* is a fourth-level heading.

    A fourth-level heading starts the line with four dot-separated
    alphanumeric components (e.g. ``1.2.3.4``), followed by horizontal
    whitespace and the heading text.

    Args:
        text: Content of one document chunk.

    Returns:
        ``True`` if the first line matches the pattern; ``False`` otherwise,
        including when *text* is empty.
    """
    if not text:
        # Nothing to inspect in an empty chunk.
        print("Not a title. Text is empty.")
        return False

    fourth_title_pattern = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    heading_line = text.splitlines()[0]
    return re.search(fourth_title_pattern, heading_line) is not None
#给四级被分开的内容 增加三级标题
def zh_third_title_enhance(docs: Document) -> Document:
    """Prefix fourth-level chunks with the preceding third-level heading.

    Walks *docs* in order. A chunk in which ``get_third_level_title`` finds
    a heading sets the current title. While a title is set, every following
    chunk that ``is_fourth_level_content`` accepts gets ``"<title> "``
    prepended to its ``page_content``; the first chunk that is neither
    clears the title again.

    Args:
        docs: Despite the annotation, this is used as a sized, iterable
            collection of objects exposing a ``page_content`` string.

    Returns:
        The same *docs* object, with matching chunks modified in place.
        Empty input is returned unchanged instead of ``None`` so the result
        can be passed on to the next enhancement step.
    """
    title = None
    print("zh_third_title_enhance ....")

    if not docs:
        print("zh_third_title_enhance 文件不存在")
        # Hand back the caller's (empty) collection rather than falling off
        # the end of the function and returning None.
        return docs

    for doc in docs:
        print(f"zh_third_title_enhance: {doc}")
        third_title = get_third_level_title(doc.page_content)
        if third_title:
            # A new third-level heading starts here; remember it.
            title = third_title
            print(f"title: {title}")
        elif title:
            print("title is not none")
            temp_fourth_content = is_fourth_level_content(doc.page_content)
            if temp_fourth_content:
                print(f"is_fourth_level_content : {temp_fourth_content}")
                doc.page_content = f"{title} {doc.page_content}"
            else:
                # Left the run of fourth-level chunks; stop prefixing.
                title = None
        print(f"final title: {title}")
    return docs
#给三级被分开的内容 增加二级标题
def zh_second_title_enhance(docs: Document) -> Document:
title = None