enhance splitter

enhance splitter
This commit is contained in:
wvivi2023 2023-11-29 13:25:44 +08:00
parent 9ba2120129
commit dce1d16e29
12 changed files with 108 additions and 25 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -1,2 +1,3 @@
from .mypdfloader import RapidOCRPDFLoader
from .myimgloader import RapidOCRLoader
from .customiedpdfloader import CustomizedPDFLoader

View File

@ -0,0 +1,70 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
import tqdm
class CustomizedPDFLoader(UnstructuredFileLoader):
    """Load a PDF by extracting the text of each page with PyPDF2.

    The text of all pages is concatenated and handed to ``unstructured``'s
    ``partition_text``. Only text returned by ``extract_text`` is used;
    images embedded in the PDF are not processed by this loader.
    """

    def _get_elements(self) -> List:
        """Return the elements partitioned from the PDF at ``self.file_path``.

        Returns:
            The list produced by ``partition_text`` for the extracted text,
            called with ``self.unstructured_kwargs``.
        """

        def pdf2text(filepath):
            """Return the text of every page, each page followed by a newline."""
            import PyPDF2

            page_texts = []
            # The context manager closes the file handle even if parsing fails.
            with open(filepath, mode='rb') as pdf_file:
                doc = PyPDF2.PdfReader(pdf_file)
                page_count = len(doc.pages)
                print(f"文档页数:{page_count}")
                # Using the bar as a context manager closes it when done.
                with tqdm.tqdm(total=page_count, desc="CustomizedPDFLoader context page index: 0") as b_unit:
                    for i, page in enumerate(doc.pages):
                        # Update the description with the 1-based page number.
                        b_unit.set_description("CustomizedPDFLoader context page index: {}".format(i + 1))
                        # Show the updated description immediately.
                        b_unit.refresh()
                        # Defensive: treat a page that yields no text as empty.
                        text = page.extract_text() or ""
                        page_texts.append(text + "\n")
                        # Advance the bar; without this it stays at 0.
                        b_unit.update(1)
            return "".join(page_texts)

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke test: run the loader on one sample PDF and print what load() returns.
    sample_loader = CustomizedPDFLoader(file_path="变电站设备验收规范第28 部分避雷针.pdf")
    print(sample_loader.load())

View File

@ -1,7 +1,7 @@
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
import tqdm
import os
class RapidOCRPDFLoader(UnstructuredFileLoader):
def _get_elements(self) -> List:
@ -13,29 +13,40 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
doc = fitz.open(filepath)
resp = ""
file_name_without_extension, file_extension = os.path.splitext(filepath)
b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
for i, page in enumerate(doc):
outputfile = file_name_without_extension + "_scan.txt"
# 打开文件以写入模式
with open(outputfile, 'w') as file:
# 更新描述
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
# 立即显示进度条更新结果
b_unit.refresh()
# TODO: 依据文本与图片顺序调整处理方式
text = page.get_text("")
resp += text + "\n"
for i, page in enumerate(doc):
img_list = page.get_images()
for img in img_list:
pix = fitz.Pixmap(doc, img[0])
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
result, _ = ocr(img_array)
if result:
ocr_result = [line[1] for line in result]
resp += "\n".join(ocr_result)
# 更新描述
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
# 立即显示进度条更新结果
b_unit.refresh()
# TODO: 依据文本与图片顺序调整处理方式
text = page.get_text("")
file.write(f"\n**********文字,页码:{i}")
file.write(text)
resp += text + "\n"
# 更新进度
b_unit.update(1)
return resp
img_list = page.get_images()
for img in img_list:
pix = fitz.Pixmap(doc, img[0])
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
result, _ = ocr(img_array)
if result:
ocr_result = [line[1] for line in result]
file.write(f"\n*****图片****,页码:{i}")
file.write(ocr_result)
resp += "\n".join(ocr_result)
# 更新进度
b_unit.update(1)
return resp
text = pdf2text(self.file_path)
from unstructured.partition.text import partition_text

BIN
server/.DS_Store vendored

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -73,7 +73,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
"UnstructuredMarkdownLoader": ['.md'],
"CustomJSONLoader": [".json"],
"CSVLoader": [".csv"],
"RapidOCRPDFLoader": [".pdf"],
"CustomizedPDFLoader": [".pdf"],
"RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
"UnstructuredFileLoader": ['.eml', '.msg', '.rst',
'.rtf', '.txt', '.xml',
@ -152,7 +152,7 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri
根据loader_name和文件路径或内容返回文档加载器
'''
try:
if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader"]:
if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader", "CustomizedPDFLoader"]:
document_loaders_module = importlib.import_module('document_loaders')
else:
document_loaders_module = importlib.import_module('langchain.document_loaders')

View File

@ -37,6 +37,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
"""Create a new TextSplitter."""
super().__init__(keep_separator=keep_separator, **kwargs)
self._separators = separators or [
SPLIT_SEPARATOE,
SPLIT_SEPARATOE,
#"\n\n",
#"\n",
@ -54,7 +55,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
# Get appropriate separator to use
separator = separators[-1]
new_separators = [SPLIT_SEPARATOE]
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2这样的章和节来分块
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2 这样的章和节来分块
text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.4.a
text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
@ -88,7 +89,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
if not new_separators:
final_chunks.append(s)
else:
text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
other_info = self._split_text(s, new_separators)
final_chunks.extend(other_info)
if _good_splits:

BIN
webui_pages/.DS_Store vendored

Binary file not shown.