parent
9ba2120129
commit
dce1d16e29
|
|
@ -1,2 +1,3 @@
|
|||
from .mypdfloader import RapidOCRPDFLoader
|
||||
from .myimgloader import RapidOCRLoader
|
||||
from .customiedpdfloader import CustomizedPDFLoader
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
import tqdm
|
||||
|
||||
|
||||
class CustomizedPDFLoader(UnstructuredFileLoader):
    """Load a PDF by reading the text of each page with PyPDF2.

    No OCR is performed here: only what ``page.extract_text()`` returns is
    used. The concatenated page text is then handed to unstructured's
    ``partition_text`` to produce the elements returned to the base loader.
    """

    def _get_elements(self) -> List:
        """Extract the text of ``self.file_path`` and partition it.

        Returns:
            The list produced by ``partition_text`` for the concatenated page
            text, called with ``self.unstructured_kwargs``.
        """

        def pdf2text(filepath):
            """Return the text of every page of *filepath*, one "\\n" after each page."""
            # Imported lazily so PyPDF2 is only required when this loader runs.
            import PyPDF2

            # The context manager closes the file handle even if parsing
            # fails; the reader pulls page data from the stream on demand, so
            # all extraction has to happen inside this block.
            with open(filepath, mode="rb") as pdf_file:
                doc = PyPDF2.PdfReader(pdf_file)
                page_count = len(doc.pages)
                print(f"文档页数:{page_count}")

                page_texts = []
                with tqdm.tqdm(
                    total=page_count,
                    desc="CustomizedPDFLoader context page index: 0",
                ) as b_unit:
                    for page_number, page in enumerate(doc.pages, start=1):
                        # Show the 1-based page currently being processed.
                        b_unit.set_description(
                            "CustomizedPDFLoader context page index: {}".format(page_number)
                        )
                        # Force an immediate redraw of the new description.
                        b_unit.refresh()
                        # Guard against a page yielding no text object at all.
                        page_texts.append((page.extract_text() or "") + "\n")
                        # Advance the bar; without this it stays at 0/total.
                        b_unit.update(1)

            return "".join(page_texts)

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: load one sample PDF from the working directory and
    # print the resulting documents.
    sample_pdf = "变电站设备验收规范第28 部分避雷针.pdf"
    docs = CustomizedPDFLoader(file_path=sample_pdf).load()
    print(docs)
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
import tqdm
|
||||
|
||||
import os
|
||||
|
||||
class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||
def _get_elements(self) -> List:
|
||||
|
|
@ -13,7 +13,13 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
|||
doc = fitz.open(filepath)
|
||||
resp = ""
|
||||
|
||||
file_name_without_extension, file_extension = os.path.splitext(filepath)
|
||||
|
||||
b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
|
||||
outputfile = file_name_without_extension + "_scan.txt"
|
||||
# 打开文件以写入模式
|
||||
with open(outputfile, 'w') as file:
|
||||
|
||||
for i, page in enumerate(doc):
|
||||
|
||||
# 更新描述
|
||||
|
|
@ -22,8 +28,11 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
|||
b_unit.refresh()
|
||||
# TODO: 依据文本与图片顺序调整处理方式
|
||||
text = page.get_text("")
|
||||
file.write(f"\n**********文字,页码:{i}")
|
||||
file.write(text)
|
||||
resp += text + "\n"
|
||||
|
||||
|
||||
img_list = page.get_images()
|
||||
for img in img_list:
|
||||
pix = fitz.Pixmap(doc, img[0])
|
||||
|
|
@ -31,6 +40,8 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
|||
result, _ = ocr(img_array)
|
||||
if result:
|
||||
ocr_result = [line[1] for line in result]
|
||||
file.write(f"\n*****图片****,页码:{i}")
|
||||
file.write(ocr_result)
|
||||
resp += "\n".join(ocr_result)
|
||||
|
||||
# 更新进度
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -73,7 +73,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
|||
"UnstructuredMarkdownLoader": ['.md'],
|
||||
"CustomJSONLoader": [".json"],
|
||||
"CSVLoader": [".csv"],
|
||||
"RapidOCRPDFLoader": [".pdf"],
|
||||
"CustomizedPDFLoader": [".pdf"],
|
||||
"RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'],
|
||||
"UnstructuredFileLoader": ['.eml', '.msg', '.rst',
|
||||
'.rtf', '.txt', '.xml',
|
||||
|
|
@ -152,7 +152,7 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri
|
|||
根据loader_name和文件路径或内容返回文档加载器。
|
||||
'''
|
||||
try:
|
||||
if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader"]:
|
||||
if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader", "CustomizedPDFLoader"]:
|
||||
document_loaders_module = importlib.import_module('document_loaders')
|
||||
else:
|
||||
document_loaders_module = importlib.import_module('langchain.document_loaders')
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
"""Create a new TextSplitter."""
|
||||
super().__init__(keep_separator=keep_separator, **kwargs)
|
||||
self._separators = separators or [
|
||||
SPLIT_SEPARATOE,
|
||||
SPLIT_SEPARATOE,
|
||||
#"\n\n",
|
||||
#"\n",
|
||||
|
|
@ -54,7 +55,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
# Get appropriate separator to use
|
||||
separator = separators[-1]
|
||||
new_separators = [SPLIT_SEPARATOE]
|
||||
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2这样的章和节来分块
|
||||
text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2 这样的章和节来分块
|
||||
text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.4.a
|
||||
text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条
|
||||
|
|
@ -88,7 +89,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
if not new_separators:
|
||||
final_chunks.append(s)
|
||||
else:
|
||||
text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
|
||||
s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块
|
||||
other_info = self._split_text(s, new_separators)
|
||||
final_chunks.extend(other_info)
|
||||
if _good_splits:
|
||||
|
|
|
|||
Binary file not shown.
Loading…
Reference in New Issue