From dce1d16e297593cfe5b771c178e0f467df5c5a01 Mon Sep 17 00:00:00 2001 From: wvivi2023 Date: Wed, 29 Nov 2023 13:25:44 +0800 Subject: [PATCH] enhance splitter enhance splitter --- .DS_Store | Bin 8196 -> 0 bytes document_loaders/__init__.py | 3 +- document_loaders/customiedpdfloader.py | 70 ++++++++++++++++++ document_loaders/mypdfloader.py | 51 ++++++++----- server/.DS_Store | Bin 6148 -> 0 bytes server/knowledge_base/.DS_Store | Bin 6148 -> 0 bytes server/knowledge_base/kb_service/.DS_Store | Bin 6148 -> 0 bytes .../kb_service/knowledge_base/.DS_Store | Bin 6148 -> 0 bytes .../kb_service/knowledge_base/test/.DS_Store | Bin 6148 -> 0 bytes server/knowledge_base/utils.py | 4 +- .../chinese_recursive_text_splitter.py | 5 +- webui_pages/.DS_Store | Bin 6148 -> 0 bytes 12 files changed, 108 insertions(+), 25 deletions(-) delete mode 100644 .DS_Store create mode 100644 document_loaders/customiedpdfloader.py delete mode 100644 server/.DS_Store delete mode 100644 server/knowledge_base/.DS_Store delete mode 100644 server/knowledge_base/kb_service/.DS_Store delete mode 100644 server/knowledge_base/kb_service/knowledge_base/.DS_Store delete mode 100644 server/knowledge_base/kb_service/knowledge_base/test/.DS_Store delete mode 100644 webui_pages/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 582265c2ea63d2fafafb4ed3cedbad02bb5e5048..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM!EVz)5S?v8<4_gl08+TE#MOwHCM^*cE0iC=2o8WkOiZxqdZXB>D@BofE_@LD z2;alZ?gkZy;VQUY~UmE$TueBXJa~Dxy zK0pi;$igCumY^Wzn}oVv#B{7-uuSOmK9(H{!0bu`5>_q`W8!r z`q6<#mjH-$oYoD;SO@4FY08q}$HOjA7wn<{LHAxt{@C50pU7E6OB9fU~-Vb3gV zg(CFXF;D7n5WYdStN<&pt^n2U4fq5KQuutiepexSzK7E!&hk#@r`W00@9Z|5hO_5< z2)?Ky$Vd4+8;oY}=+|qdl5jO1g>T}h7`E=eP-#Aj(`cp>;&Fr_@88DhSQP^`Psf?g z4UEHSI?Z9Ly;ykNt}A<;r7IWS!QnGk9`<|7Wz%`^=*ja}!;|DRRTqYcunz;+uMqpX z&gv|frHM)>=%K6{ay^BK*{irgt_Mf1>_5FKSIeft@gAJuRaH List: + def pdf2text(filepath): + import PyPDF2 + mypdf = open(filepath,mode='rb') + doc = PyPDF2.PdfReader(mypdf) + page_count = len(doc.pages) + print(f"文档页数:{page_count}") + + i = 0 + resp = "" + b_unit = tqdm.tqdm(total=page_count, desc="CustomizedPDFLoader context page index: 0") + while i < page_count: + # 更新描述 + b_unit.set_description("CustomizedPDFLoader context page index: {}".format(i+1)) + # 立即显示进度条更新结果 + b_unit.refresh() + first_page = doc.pages[i] + text= first_page.extract_text() + resp += text + "\n" + i = i+1 + + return resp + + # def pdf2text(filepath): + # import fitz # pyMuPDF里面的fitz包,不要与pip install fitz混淆 + # from rapidocr_onnxruntime import RapidOCR + # import numpy as np + # ocr = RapidOCR() + # doc = fitz.open(filepath) + # resp = "" + + # b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0") + # for i, page in enumerate(doc): + + # # 更新描述 + # b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i)) + # # 立即显示进度条更新结果 + # b_unit.refresh() + # # TODO: 依据文本与图片顺序调整处理方式 + # text = page.get_text("") + # resp += text + "\n" + + # img_list = page.get_images() + # for img in img_list: + # pix = fitz.Pixmap(doc, img[0]) + # img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1) + # result, _ = ocr(img_array) + # if result: + # ocr_result = [line[1] for line in result] + # resp += "\n".join(ocr_result) + + # # 更新进度 + # b_unit.update(1) + # return resp + + text = pdf2text(self.file_path) + from unstructured.partition.text import partition_text + return partition_text(text=text, **self.unstructured_kwargs) + + +if __name__ == "__main__": + loader = CustomizedPDFLoader(file_path="变电站设备验收规范第28 部分避雷针.pdf") + docs = loader.load() + print(docs) diff --git a/document_loaders/mypdfloader.py b/document_loaders/mypdfloader.py index 6cb7726..e3e05b9 100644 --- a/document_loaders/mypdfloader.py +++ b/document_loaders/mypdfloader.py @@ -1,7 +1,7 @@ from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader import tqdm - +import os class RapidOCRPDFLoader(UnstructuredFileLoader): def _get_elements(self) -> List: @@ -13,29 +13,40 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): doc = fitz.open(filepath) resp = "" + file_name_without_extension, file_extension = os.path.splitext(filepath) + b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0") - for i, page in enumerate(doc): + outputfile = file_name_without_extension + "_scan.txt" + # 打开文件以写入模式 + with open(outputfile, 'w') as file: + + for i, page in enumerate(doc): - # 更新描述 - b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i)) - # 立即显示进度条更新结果 - b_unit.refresh() - # TODO: 依据文本与图片顺序调整处理方式 - text = page.get_text("") - resp += text + "\n" + # 更新描述 + b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i)) + # 立即显示进度条更新结果 + b_unit.refresh() + # TODO: 依据文本与图片顺序调整处理方式 + text = page.get_text("") + file.write(f"\n**********文字,页码:{i}") + file.write(text) + resp += text + "\n" - img_list = page.get_images() - for img in img_list: - pix = fitz.Pixmap(doc, img[0]) - img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1) - result, _ = ocr(img_array) - if result: - ocr_result = [line[1] for line in result] - resp += "\n".join(ocr_result) - # 更新进度 - b_unit.update(1) - return resp + img_list = page.get_images() + for img in img_list: + pix = fitz.Pixmap(doc, img[0]) + img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1) + result, _ = ocr(img_array) + if result: + ocr_result = [line[1] for line in result] + file.write(f"\n*****图片****,页码:{i}") + file.write(ocr_result) + resp += "\n".join(ocr_result) + + # 更新进度 + b_unit.update(1) + return resp text = pdf2text(self.file_path) from unstructured.partition.text import partition_text diff --git a/server/.DS_Store b/server/.DS_Store deleted file mode 100644 index 5a32a3ef209805f33c127494e1ac1e34501caf28..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!A{#i5S;}R98ggXRB&90Yb25q0#Pqk3O|4mJropdY=MRGqS(QQD3Y&jKZt%w zzlWLKB_c`b5g{}q?Y!OD8C&zlyXyr2(H})^fCd1#*a)39RR0i~r?wzV)>1*zb+&t7PNyk5y-8D!+k4%^rtJ3iCzHDSd1v?Q$>27<%hWGJpuqh`%O{J^@EbeRd>;H^ zma6O;ZREcJw~&E?5>kjEf@{P7yKZ(jQ?tbGsb zeXvip?|o_EXRz0uEB#MX<5x>hpT8)T8gkdC7VaR9m%m6d+p%@U(k$=?3lJU`+IMy9Hinp<8!8u7CM9*Sn5G^SDBcN$u#SFYD15X3o%mDxZ diff --git a/server/knowledge_base/.DS_Store b/server/knowledge_base/.DS_Store deleted file mode 100644 index 4030beb80b4c55370309521970e67de64fd4f2bd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}xR_5S{|cA~E5h3CAX0F+vOk<7I>R0a1CyzJ)qjXrSJ+s0>adBzcTDHp8rE@DX zXXp(_X~!F!)9OTs*q_cl|1=6Sx3Yd9lA#wR;Xpb>L5N2#&Y~m`Sx1bLAeFw3GFT<6 z>sMp1V^=&qh6ts(79ZpbiqNFV>zKxyoo9WZJZ{Ew#Hl`?x2{5 NfTY0;hJjyY;03pgkxT#p diff --git a/server/knowledge_base/kb_service/knowledge_base/.DS_Store b/server/knowledge_base/kb_service/knowledge_base/.DS_Store deleted file mode 100644 index 32904a3012664ad46131f43e27b1c1f4b3ed81af..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5ZAHtJ2 zPricBU?EGwYzl7Z^A%xcZeua<%A%uZS%-PU^Msmip0!ZZ; zUK)43PAlUCA? z>#g3o?41Y^dLz5#od)f+Sy(?5QPK*cc25-qej8FQ&VtAnX_p;0lStuvQ?h1o1dm{h*Aw1U+-uS4 zj)If1j%YN@x5{3h-(Q~6kAvyB|6{y+JAdqZw-<#Xx`FXV^PHFhn#ce$fDA000kb!; z{K98~6F>%#fxpIp+#fioM8{yJQEeU2s4D=V1#Tr^%e#cE;RYRpnMR0!P+bbBOR2eH zP+bmwZt@(1nMPessJVPl^JHpnC{#Ti#^)-XP)8%S$N(}h&A^iG7UcPVH2?lTU4%Vk z02%mK3{b1#HtMh>bGFVbPM)^uHKG&+7G1M=)Lq(y{0IWlv8!hw-zr|)~iuvB`biDYUmonaZ4W7l?9M0VH= zN5fika4CCdLio*+CiIa(Iwej}>i{c5#4dl2{{=)eloUCjJ6G?4*h02x>* z1Lodgxs|ShlOqGjz+YrQ-VYp9qGd4FsICrZR0#l>0=E*dWi26dxIxQctPw^)s7?jc zsnnDhRHuWVn>fp0tWl>EYKjkPR;H#xp=x!QpQ~^}EsfYB1IWNU14-S@%lH4}^ZI{2 z346!@GVre$psBiDufdkg+q$qh`POpK8&D}SF4p*-0)~2uAyz)c1yCj6=h6VQ48|J4 Q1A>196b;xQ13${Z8%_9ZH2?qr diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index f25a706..e91587b 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -73,7 +73,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'], "UnstructuredMarkdownLoader": ['.md'], "CustomJSONLoader": [".json"], "CSVLoader": [".csv"], - "RapidOCRPDFLoader": [".pdf"], + "CustomizedPDFLoader": [".pdf"], "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'], "UnstructuredFileLoader": ['.eml', '.msg', '.rst', '.rtf', '.txt', '.xml', @@ -152,7 +152,7 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri 根据loader_name和文件路径或内容返回文档加载器。 ''' try: - if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader"]: + if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader", "CustomizedPDFLoader"]: document_loaders_module = importlib.import_module('document_loaders') else: document_loaders_module = importlib.import_module('langchain.document_loaders') diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index 480d6ca..b163504 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -37,6 +37,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): """Create a new TextSplitter.""" super().__init__(keep_separator=keep_separator, **kwargs) self._separators = separators or [ + SPLIT_SEPARATOE, SPLIT_SEPARATOE, #"\n\n", #"\n", @@ -54,7 +55,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): # Get appropriate separator to use separator = separators[-1] new_separators = [SPLIT_SEPARATOE] - text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2这样的章和节来分块 + text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2 这样的章和节来分块 text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.4.a text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条 text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条 @@ -88,7 +89,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): if not new_separators: final_chunks.append(s) else: - text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块 + s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块 other_info = self._split_text(s, new_separators) final_chunks.extend(other_info) if _good_splits: diff --git a/webui_pages/.DS_Store b/webui_pages/.DS_Store deleted file mode 100644 index 2ff7808f3a39b3c2fa3e26edd478776fa2701eb8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyJ`bL3>+mc4$`-& znmulc(+Pkr&f6DY4q%`=;?u*_{M>zF50x<@op&6u!x?WMtJ7gpeLLaY0b9IdgZ)qb z{&?8sy|; zaUHV+v3Y{n3&%ueXqHrBQmsY|OFHwd>U!arm~>bTA68GcnoumB&ih-G!+N5k6p#Yf z3Owd^?*0Fj{>%J-P0~&ZNP&N)fX$Yx<$|wNy>;?(-fJ6uP4}8Fx*OL)VTg82jCRb8 fx8vt1%DU!jp7+8rG3d+(ov5Dy*F`1;{#t