增加对四级目录的支持和其他支持目录的标题增强
This commit is contained in:
parent
7f8855afe0
commit
d123ad7c29
|
|
@ -11,6 +11,7 @@ from configs import (
|
|||
TEXT_SPLITTER_NAME,
|
||||
)
|
||||
import importlib
|
||||
from text_splitter import zh_third_title_enhance
|
||||
from text_splitter import zh_second_title_enhance
|
||||
from text_splitter import zh_first_title_enhance
|
||||
import langchain.document_loaders
|
||||
|
|
@ -371,6 +372,7 @@ class KnowledgeFile:
|
|||
return []
|
||||
#先给二级下 被分开的三级目录分块 增加二级标题,再给分开的二级目录增加一级标题,然后给整个文档的所有分块增加文档标题分块
|
||||
if zh_title_enhance:
|
||||
docs = zh_third_title_enhance(docs)
|
||||
docs = zh_second_title_enhance(docs)
|
||||
docs = zh_first_title_enhance(docs)
|
||||
docs = customize_zh_title_enhance(docs)
|
||||
|
|
|
|||
|
|
@ -2,5 +2,6 @@ from .chinese_text_splitter import ChineseTextSplitter
|
|||
from .ali_text_splitter import AliTextSplitter
|
||||
from .zh_title_enhance import zh_title_enhance
|
||||
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
||||
from .zh_second_title_enhance import zh_third_title_enhance
|
||||
from .zh_second_title_enhance import zh_second_title_enhance
|
||||
from .zh_second_title_enhance import zh_first_title_enhance
|
||||
from .zh_second_title_enhance import zh_first_title_enhance
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ logger = logging.getLogger(__name__)
|
|||
First_SEPARATOE = "\n\n\n\n\n\n\n\n\n\n"
|
||||
Second_SEPARATOE = "\n\n\n\n\n\n\n\n"
|
||||
Third_SEPARATOE = "\n\n\n\n\n\n"
|
||||
Fourth_SEPARATOE = "\n\n\n\n"
|
||||
def _split_text_with_regex_from_end(
|
||||
text: str, separator: str, keep_separator: bool
|
||||
) -> List[str]:
|
||||
|
|
@ -41,7 +42,8 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
self._separators = separators or [
|
||||
First_SEPARATOE,
|
||||
Second_SEPARATOE,
|
||||
Third_SEPARATOE
|
||||
Third_SEPARATOE,
|
||||
Fourth_SEPARATOE
|
||||
#"\n\n",
|
||||
#"\n",
|
||||
# "。|!|?",
|
||||
|
|
@ -60,21 +62,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
separator = separators[-1]
|
||||
new_separators = []
|
||||
if self.is_recursive == False:
|
||||
#一级目录
|
||||
text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块
|
||||
text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的
|
||||
text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换
|
||||
text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章
|
||||
#text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2
|
||||
|
||||
#二级目录
|
||||
text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2
|
||||
text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\n\n\n\n\1", text) # 通过\n1.2 这样的章和节来分块
|
||||
text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+第\s*\S+\s*条(:|:))', r"\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
|
||||
#三级目录
|
||||
text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\n\n\1", text) # 再通过 1.2.3
|
||||
text = re.sub(r'(\n+((一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十)))', r"\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))', r"\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+((一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十)))', r"\n\n\n\n\n\n\1", text)
|
||||
text = re.sub(r'(\n+(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))', r"\n\n\n\n\n\n\1", text)
|
||||
|
||||
#四级目录
|
||||
text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\1", text) # 再通过 1.2.3
|
||||
text = text.rstrip() # 段尾如果有多余的\n就去掉它
|
||||
self.is_recursive = True
|
||||
for i, _s in enumerate(separators):
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ def get_fist_level_title(
|
|||
if ENDS_IN_PUNCT_RE.search(first_line) is not None:
|
||||
return ""
|
||||
|
||||
FIRST_TITLE = r'((?<!.)\d+[^\S\n]+[^\s\.]+\S+)'
|
||||
FIRST_TITLE = r'((?<!.)\d+[^\S\n]+[^\s\.]+\S+|(?<!.)第\s*\S+\s*章\s+)'
|
||||
TITLE_PUNCT_RE = re.compile(FIRST_TITLE)
|
||||
if TITLE_PUNCT_RE.search(first_line) is not None:
|
||||
return first_line
|
||||
|
|
@ -40,7 +40,7 @@ def get_second_level_title(
|
|||
if ENDS_IN_PUNCT_RE.search(first_line) is not None:
|
||||
return ""
|
||||
|
||||
Second_TITLE = r'((?<!.)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
|
||||
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9])|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|:)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))'
|
||||
TITLE_PUNCT_RE = re.compile(Second_TITLE)
|
||||
if TITLE_PUNCT_RE.search(first_line) is not None:
|
||||
return first_line
|
||||
|
|
@ -63,7 +63,7 @@ def is_second_level_content(
|
|||
splitlines = text.splitlines()
|
||||
first_line = splitlines[0]
|
||||
|
||||
Second_TITLE = r'((?<!.)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|(?<!.)(表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)'
|
||||
Second_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|(?<!.)(表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)|(?<!.)第\s*\S+\s*条\s+|(?<!.)第\s*\S+\s*条(:|:)|(?<!.)(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、)'
|
||||
TITLE_PUNCT_RE = re.compile(Second_TITLE)
|
||||
if TITLE_PUNCT_RE.search(first_line) is not None:
|
||||
return True
|
||||
|
|
@ -82,13 +82,83 @@ def is_third_level_content(
|
|||
splitlines = text.splitlines()
|
||||
first_line = splitlines[0]
|
||||
|
||||
Third_TITLE = r'((?<!.)[0-9]+\s*\.\s*[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|((?<!.)表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)'
|
||||
Third_TITLE = r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))|((?<!.)表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)|((?<!.)(一)|(二)|(三)|(四)|(五)|(六)|(七)|(八)|(九)|(十)|(十一)|(十二)|(十三)|(十四)|(十五)|(十六)|(十七)|(十八)|(十九)|(二十))|((?<!.)(\(一\)|\(二\)|\(三\)|\(四\)|\(五\)|\(六\)|\(七\)|\(八\)|\(九\)|\(十\)|\(十一\)|\(十二\)|\(十三\)|\(十四\)|\(十五\)|\(十六\)|\(十七\)|\(十八\)|\(十九\)|\(二十\)))'
|
||||
TITLE_PUNCT_RE = re.compile(Third_TITLE)
|
||||
if TITLE_PUNCT_RE.search(first_line) is not None:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def get_third_level_title(
    text: str,
) -> str:
    """Return the third-level heading line (e.g. ``1.2.3 Title``) of a chunk.

    Only the first two lines of ``text`` are inspected. The first line is
    checked first; the second line is consulted only when the first line is
    not a third-level heading.

    Args:
        text: Content of one chunk.

    Returns:
        The heading line that matched, or ``""`` when ``text`` is empty, when
        its first line ends with a punctuation character, or when neither of
        the first two lines looks like a third-level heading.
    """
    # NOTE: the previous guard was `len(text) == 0 and len(text) >= 25`, which
    # can never be true, so empty input crashed on `splitlines()[0]`.  Only the
    # empty check is enforced here: `text` is a whole chunk (not just a title),
    # so rejecting everything of 25+ characters would presumably discard almost
    # every chunk -- confirm the intended limit before adding one.
    if not text:
        print("Not a title. Text is empty.")
        return ""

    lines = text.splitlines()
    first_line = lines[0]

    # A first line that ends with punctuation is treated as body text.
    ends_in_punct_re = re.compile(r"[^\w\s]\Z")
    if ends_in_punct_re.search(first_line) is not None:
        return ""

    # Three dot-separated alphanumeric components at the very start of the
    # line, followed by horizontal whitespace and the heading text.
    third_title_re = re.compile(
        r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )
    if third_title_re.search(first_line) is not None:
        return first_line

    # Fall back to the second line when the first one is not a heading.
    if len(lines) > 1:
        second_line = lines[1]
        if third_title_re.search(second_line) is not None:
            return second_line

    return ""
|
||||
|
||||
# Check whether the first line of a chunk is a fourth-level heading (e.g. "1.2.3.4 Title").
|
||||
def is_fourth_level_content(
    text: str,
) -> bool:
    """Tell whether a chunk begins with a fourth-level heading.

    A fourth-level heading is a first line that starts with four
    dot-separated alphanumeric components (for example ``1.2.3.4``) followed
    by horizontal whitespace and heading text.

    Args:
        text: Content of one chunk.

    Returns:
        ``True`` when the first line matches the fourth-level pattern,
        ``False`` otherwise (including for empty input).
    """
    # Empty text cannot carry a heading.
    if not len(text):
        print("Not a title. Text is empty.")
        return False

    heading_candidate = text.splitlines()[0]

    fourth_level_re = re.compile(
        r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )
    return fourth_level_re.search(heading_candidate) is not None
|
||||
|
||||
#给四级被分开的内容 增加三级标题
|
||||
def zh_third_title_enhance(docs: Document) -> Document:
    """Prefix fourth-level chunks with the third-level heading they follow.

    Despite the annotation, ``docs`` is iterated as a sequence of objects
    exposing a ``page_content`` string. The sequence is walked in order:

    * a chunk for which ``get_third_level_title`` returns a heading becomes
      the current title;
    * while a title is active, a chunk accepted by
      ``is_fourth_level_content`` gets ``"<title> "`` prepended to its
      ``page_content`` (the objects are modified in place);
    * any other chunk clears the active title.

    Args:
        docs: Chunks to enhance, in document order.

    Returns:
        The same ``docs`` object.  Empty input is returned unchanged instead
        of falling through to an implicit ``None``, so that chained calls
        such as ``docs = zh_second_title_enhance(zh_third_title_enhance(docs))``
        keep working.
    """
    print("zh_third_title_enhance ....")

    if not docs:
        print("zh_third_title_enhance 文件不存在")
        return docs

    title = None
    for doc in docs:
        print(f"zh_third_title_enhance: {doc}")
        third_title = get_third_level_title(doc.page_content)
        if third_title:
            # This chunk opens a new third-level section.
            title = third_title
            print(f"title: {title}")
        elif title:
            print("title is not none")
            temp_fourth_content = is_fourth_level_content(doc.page_content)
            if temp_fourth_content:
                print(f"is_fourth_level_content : {temp_fourth_content}")
                doc.page_content = f"{title} {doc.page_content}"
            else:
                # Left the fourth-level run; stop propagating the title.
                title = None
        print(f"final title: {title}")
    return docs
|
||||
|
||||
#给三级被分开的内容 增加二级标题
|
||||
def zh_second_title_enhance(docs: Document) -> Document:
|
||||
title = None
|
||||
|
|
|
|||
Loading…
Reference in New Issue