优化第一级目录分块
This commit is contained in:
parent
dce1d16e29
commit
c936f040e4
Binary file not shown.
|
|
@ -75,12 +75,20 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
|
|||
print(f"search_docs, query:{query}")
|
||||
docs = kb.search_docs(query, top_k, score_threshold)
|
||||
print(f"search_docs, docs:{docs}")
|
||||
|
||||
bFind = False
|
||||
if len(pre_doc) > 0:
|
||||
if docs is not None:
|
||||
docs.append(pre_doc[0])
|
||||
for tempDoc in docs:
|
||||
if tempDoc[0].page_content == pre_doc[0][0].page_content:
|
||||
bFind = True
|
||||
break
|
||||
else:
|
||||
docs = pre_doc[0]
|
||||
|
||||
docs = pre_doc[0]
|
||||
|
||||
if not bFind:
|
||||
docs.append(pre_doc[0])
|
||||
|
||||
data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
|
||||
|
||||
return data
|
||||
|
|
|
|||
|
|
@ -54,13 +54,15 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
final_chunks = []
|
||||
# Get appropriate separator to use
|
||||
separator = separators[-1]
|
||||
new_separators = [SPLIT_SEPARATOE]
|
||||
new_separators = []
|
||||
text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块
|
||||
text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的
|
||||
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2 这样的章和节来分块
|
||||
text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.4.a
|
||||
text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 再通过 1.2 来分块
|
||||
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 再通过 1.2 来分块
|
||||
text = text.rstrip() # 段尾如果有多余的\n就去掉它
|
||||
for i, _s in enumerate(separators):
|
||||
_separator = _s if self._is_separator_regex else re.escape(_s)
|
||||
|
|
@ -89,7 +91,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
if not new_separators:
|
||||
final_chunks.append(s)
|
||||
else:
|
||||
s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
|
||||
s = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
|
||||
other_info = self._split_text(s, new_separators)
|
||||
final_chunks.extend(other_info)
|
||||
if _good_splits:
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue