From 5c8610f47fa373af198f2592ab52650e98d5c166 Mon Sep 17 00:00:00 2001 From: wvivi2023 Date: Thu, 28 Dec 2023 10:52:52 +0800 Subject: [PATCH] enhance 3rd catalog content --- .DS_Store | Bin 6148 -> 10244 bytes server/knowledge_base/utils.py | 3 +- text_splitter/__init__.py | 3 +- .../chinese_recursive_text_splitter.py | 77 ++++++++---- text_splitter/zh_second_title_enhance.py | 113 ++++++++++++++++++ 5 files changed, 170 insertions(+), 26 deletions(-) create mode 100644 text_splitter/zh_second_title_enhance.py diff --git a/.DS_Store b/.DS_Store index 23badb7dba054fdf05b0fea42dccabb9962d5bd0..5276a037243498ee07863db78023769616a2cc84 100644 GIT binary patch literal 10244 zcmeHMU2GIp6h3EK=$+wjTFcLXOIIscN(69EVV0HcdbbuLv`D7>FvQ;Otk6p~2x00s<$lPWl4DRau` zKoKAiAP^uBAP^uBAaE@pK;LX$*c>5)IzS*mAV6Re0lq(k>Ebf(%LyU9t%E8r1t3{Y zYOl~e^qx+3 zVQ)BKT*#mf5C{;MiU7~uDp3NY=7^F zUFda_ZqDgUX7}<|pKY6|;(apJZ|H@DTHR$^Zql&ytY=_kbQ-d6z_2oQq0`P;8OO8j z6&*}wazb4)J|2n08p6@`V-4Z)NHo&a5RS##j~$cQ!rHYPb|ns*qn7=oa0LC$0OplA z_S96b^1WHhw5?(4;a2nEFnMf40Zh;#L+2JcBp?Mza6lI)7#;#=t~vtR_;_P9+7NDR zjd>8Xv=JP!NNWiM+(jzWLU5cqXLPazklEaE$U>gi*9!obIaT8DJaEa?BUC5I^)cds zLHAsDsr&C;_4QvoKHl8c5N?e|`0}r7rT}PdX)9U%C%na^ZtRfZRSQTC!toR zlV&<+CSAS5G<%H)bZ5}jkGTA9$g#6-mp5?GAX~0WsA|s9cj;tcKt=2Q!}CTo;W z&s`Y0b>-@&==!_5w(Qt>Zr=O_s#2v?59AHUNSXTXf{}4Y_9UIOZec!)L=%QX%e zdRDH)tlVcMNA+`y7B5k?rP|HYSuwSi4dznD{W-%uDxOiJ-6E@}*uD(C*FAQI&zSLg zv>I&%V|%&2E2~p2)JmOJFRMw`=a6e+ibiFvtU7YvVcPM8X|vMG)Qr5_Or~^GsM?iv zOx-RIOa~S#;knO{0Y|th_-RJT17` z=hd$nW3};mu`-u|QX6j)TA6;rX|RTVux){T6h>ok6pq8oa1!2xci}_$48DT%@ICwh zKfWp-3{6tcyOFk$m^qHKfB!py-+>R{V>k=vh`_(W@9-yFB+|~uY9j1PtRvDkU?Y(>hU;+y zZp3cfj9YOVQMeELaS-?55T;SbVH`n&2<&1G^Y|b>L?nKcXnX>n#%J(Zd=5|IEBGp& z!q@N(JcFN<(9@@Dk)-(P%=9ez;uXyT86XfK5FijBaAPA-CfpD6^Z)MY|NpxwKVir{DuSAM4T?gsng^!yM(leoo58-<6IG!5(IKIx;`T3#OANO?= gLVD(8=b4ifr}_W;p8=)WNs#{s`9CKl+&KCFzg0?H>;M1& delta 108 zcmZn(XfcprU|?W$DortDU=RQ@Ie-{Mvv5r;6q~50$jGuWU^g=(%VZvb?~_@D|7?CC u$-%Ukor6P=8K?>f1h|2OD@en}!tczJ`BgkY+8LN2MuALc*c{I@hZz7E$`VEZ diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index dc0e6ae..6a7ebd7 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -13,7 +13,7 @@ from configs import ( TEXT_SPLITTER_NAME, ) import importlib -from text_splitter import zh_title_enhance as func_zh_title_enhance +from text_splitter import zh_second_title_enhance import langchain.document_loaders from langchain.document_loaders.word_document import Docx2txtLoader from langchain.docstore.document import Document @@ -355,6 +355,7 @@ class KnowledgeFile: i = i+1 if zh_title_enhance: + docs = zh_second_title_enhance(docs) docs = customize_zh_title_enhance(docs) i = 1 outputfile = file_name_without_extension + "_split.txt" diff --git a/text_splitter/__init__.py b/text_splitter/__init__.py index dc06412..f88bafb 100644 --- a/text_splitter/__init__.py +++ b/text_splitter/__init__.py @@ -1,4 +1,5 @@ from .chinese_text_splitter import ChineseTextSplitter from .ali_text_splitter import AliTextSplitter from .zh_title_enhance import zh_title_enhance -from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter \ No newline at end of file +from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter +from .zh_second_title_enhance import zh_second_title_enhance \ No newline at end of file diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index 81aaeca..7c66321 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -2,7 +2,7 @@ import re from typing import List, Optional, Any from langchain.text_splitter import RecursiveCharacterTextSplitter import logging -import PyPDF2 +#import PyPDF2 logger = logging.getLogger(__name__) @@ -62,15 +62,15 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): if self.is_recursive == False: text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块 text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的 - text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过“手工分段**” + text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换 text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章 - text = re.sub(r'(\n+(? bool: + # 文本长度为0的话或长度大于25,肯定不是title + if len(text) == 0 and len (text)>= 25: + print("Not a title. Text is empty or longer than 25.") + return "" + + splitlines = text.splitlines() + first_line = splitlines[0] + # 文本中有标点符号,就不是title + ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" + ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) + if ENDS_IN_PUNCT_RE.search(first_line) is not None: + return "" + + FIRST_TITLE = r'((? str: + # 文本长度为0的话,肯定不是title + if len(text) == 0 and len (text)>= 25: + print("Not a title. Text is empty or longer than 25.") + return "" + + splitlines = text.splitlines() + first_line = splitlines[0] + # 文本中有标点符号,就不是title + ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" + ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) + if ENDS_IN_PUNCT_RE.search(first_line) is not None: + return "" + + Second_TITLE = r'((?1: + Second_line = splitlines[1] + if TITLE_PUNCT_RE.search(Second_line) is not None: + return Second_line + return "" + +#judge if it is 3rd level content +def is_third_level_content( + text: str, +) -> bool: + # 文本长度为0的话,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty.") + return False + + splitlines = text.splitlines() + first_line = splitlines[0] + + Third_TITLE = r'((? Document: + title = None + if len(docs) > 0: + for doc in docs: + second_title = get_second_level_title(doc.page_content) + if second_title: + title = second_title + elif title: + temp_third_content = is_third_level_content(doc.page_content) + if temp_third_content: + doc.page_content = f"{title} {doc.page_content}" + else: + title = None + return docs + else: + print("文件不存在") + + +if __name__ == "__main__": + str = """6 进出等电位 +6.1 直线塔进出等电位 +6.1.1 对于直线塔, 作业人员不得从横担或绝缘子串垂直进出等电位, 可采用吊篮(吊椅、吊梯) 法、 绝缘软梯法等方式进出等电位。 +6.1.2 等电位作业人员进出等电位时与接地体及带电体的各电气间隙距离(包括安全距离、组合间隙) 均应满足表 1 、3 要求。 +6.1.3 吊篮(吊椅、吊梯)必须用吊拉绳索稳固悬吊; 吊篮(吊椅、吊梯)的移动速度必须用绝缘滑 车组严格控制, 做到均匀、慢速; 固定吊拉绳索的长度, 应准确计算或实际丈量, 保证等电位作业人员 即将进入等电位时人体最高部位不超过导线侧均压环。 +6.2 耐张塔进出等电位 +6.2.1 在耐张塔进出等电位时,作业人员可采用沿耐张绝缘子串方法或其它方法进出等电位。 +6.2.2 等电位作业人员沿绝缘子串移动时, 手与脚的位置必须保持对应一致, 且人体和工具短接的绝 缘子片数应符合 5.2.2 的要求。 +6.2.3 等电位作业人员所系安全带,应绑在手扶的绝缘子串上,并与等电位作业人员同步移动。 +6.2.4 等电位作业人员在进出等电位时,应在移动至距离带电体 3 片绝缘子时进行电位转移,方可进 行后续操作。 +6.2.5 带电作业人员与接地体及带电体的各电气间隙距离(包括安全距离、组合间隙)和经人体或工 具短接后的良好绝缘子片数均应满足表 4 要求,否则不能沿耐张绝缘子串进出等电位。 +7 作业中的注意事项 +7.1 等电位作业人员在带电作业过程中时,应避免身体动作幅度过大。 +7.2 等电位作业人员与地电位作业人员之间传递物品应采用绝缘工具,绝缘工具的有效长度,应满足 表 2 的规定。 +7.3 屏蔽服装应无破损和孔洞, 各部分应连接良好、可靠。发现破损和毛刺时应送有资质的试验单位 进行屏蔽服装电阻和屏蔽效率测量,测量结果满足本标准 5.3.1 条的要求后,方可使用。 +7.4 绝缘工具在使用前, 应使用 2500V 及以上兆欧表进行分段检测(电极宽 2cm,极间宽 2cm),阻值 不低于 700MΩ。""" + title = is_third_level_content(str) + print(title) + title = get_second_level_title(str) + print(title) + #zh_second_title_enhance() \ No newline at end of file