优化第一级目录分块

This commit is contained in:
wvivi2023 2023-12-05 16:51:45 +08:00
parent dce1d16e29
commit c936f040e4
5 changed files with 200 additions and 6 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
server/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -75,12 +75,20 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
print(f"search_docs, query:{query}")
docs = kb.search_docs(query, top_k, score_threshold)
print(f"search_docs, docs:{docs}")
bFind = False
if len(pre_doc) > 0:
if docs is not None:
docs.append(pre_doc[0])
for tempDoc in docs:
if tempDoc[0].page_content == pre_doc[0][0].page_content:
bFind = True
break
else:
docs = pre_doc[0]
docs = pre_doc[0]
if not bFind:
docs.append(pre_doc[0])
data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
return data

View File

@ -54,13 +54,15 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
final_chunks = []
# Get appropriate separator to use
separator = separators[-1]
new_separators = [SPLIT_SEPARATOE]
new_separators = []
text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块
text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2 这样的章和节来分块
text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.4.a
text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 再通过 1.2 来分块
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 再通过 1.2 来分块
text = text.rstrip() # 段尾如果有多余的\n就去掉它
for i, _s in enumerate(separators):
_separator = _s if self._is_separator_regex else re.escape(_s)
@ -89,7 +91,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
if not new_separators:
final_chunks.append(s)
else:
s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
s = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
other_info = self._split_text(s, new_separators)
final_chunks.extend(other_info)
if _good_splits:

File diff suppressed because one or more lines are too long