enhance pdf loader
This commit is contained in:
parent
6b82620033
commit
7b9369e625
|
|
@ -2,8 +2,10 @@ from typing import List
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from configs import PDF_OCR_THRESHOLD
|
from configs import PDF_OCR_THRESHOLD
|
||||||
from document_loaders.ocr import get_ocr
|
from document_loaders.ocr import get_ocr
|
||||||
|
#PDF_OCR_THRESHOLD= (0.6,0.6)
|
||||||
|
#from ocr import get_ocr
|
||||||
import tqdm
|
import tqdm
|
||||||
|
import re
|
||||||
|
|
||||||
class RapidOCRPDFLoader(UnstructuredFileLoader):
|
class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
|
|
@ -18,10 +20,13 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
for i, page in enumerate(doc):
|
for i, page in enumerate(doc):
|
||||||
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
|
b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
|
||||||
b_unit.refresh()
|
b_unit.refresh()
|
||||||
|
print(f"****page:{i+1}****")
|
||||||
text = page.get_text("")
|
text = page.get_text("")
|
||||||
resp += text + "\n"
|
text_lines = text.strip().split("\n")
|
||||||
|
#print(f"文字内容:{text_lines}")
|
||||||
|
|
||||||
img_list = page.get_image_info(xrefs=True)
|
img_list = page.get_image_info(xrefs=True)
|
||||||
|
ocr_result = []
|
||||||
for img in img_list:
|
for img in img_list:
|
||||||
if xref := img.get("xref"):
|
if xref := img.get("xref"):
|
||||||
bbox = img["bbox"]
|
bbox = img["bbox"]
|
||||||
|
|
@ -34,10 +39,25 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
result, _ = ocr(img_array)
|
result, _ = ocr(img_array)
|
||||||
if result:
|
if result:
|
||||||
ocr_result = [line[1] for line in result]
|
ocr_result = [line[1] for line in result]
|
||||||
resp += "\n".join(ocr_result)
|
#print(f"图片内容:{ocr_result}")
|
||||||
|
#resp += "\n".join(ocr_result)
|
||||||
|
|
||||||
|
if (len(ocr_result)>0):
|
||||||
|
resp += "\n".join(ocr_result)
|
||||||
|
else:
|
||||||
|
if text_lines:
|
||||||
|
# 假设页码在最后一行
|
||||||
|
if text_lines[-1].isdigit():
|
||||||
|
text = "\n".join(text_lines[:-1])
|
||||||
|
print(f"******去除了页码")
|
||||||
|
resp += text + "\n"
|
||||||
|
|
||||||
# 更新进度
|
# 更新进度
|
||||||
b_unit.update(1)
|
b_unit.update(1)
|
||||||
|
|
||||||
|
resp = re.sub(r'((?<!.)\d+(?!\.|[a-zA-Z0-9]))', r"\1 ", resp)
|
||||||
|
resp = re.sub(r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+(?!\.|[a-zA-Z0-9]))', r"\1 ", resp)
|
||||||
|
resp = re.sub(r'((?<!.)[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+(?!\.|[a-zA-Z0-9]))', r"\1 ", resp)
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
text = pdf2text(self.file_path)
|
text = pdf2text(self.file_path)
|
||||||
|
|
@ -46,6 +66,9 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
loader = RapidOCRPDFLoader(file_path="../tests/samples/ocr_test.pdf")
|
loader = RapidOCRPDFLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/图片版pdf数据/变电站集中监控验收技术导则.pdf")
|
||||||
|
#loader = RapidOCRPDFLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/原PDF文档/设备/AQ80012007.pdf")
|
||||||
|
#loader = RapidOCRPDFLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/原PDF文档/设备/DL4081991.pdf")
|
||||||
|
#loader = RapidOCRPDFLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/原PDF文档/设备/AQ80032007.pdf")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
print(docs)
|
print(docs)
|
||||||
|
|
|
||||||
|
|
@ -155,23 +155,23 @@ def is_fourth_level_content(
|
||||||
#给四级被分开的内容 增加三级标题
|
#给四级被分开的内容 增加三级标题
|
||||||
def zh_third_title_enhance(docs: Document) -> Document:
|
def zh_third_title_enhance(docs: Document) -> Document:
|
||||||
title = None
|
title = None
|
||||||
print(f"zh_third_title_enhance ....")
|
#print(f"zh_third_title_enhance ....")
|
||||||
if len(docs) > 0:
|
if len(docs) > 0:
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
#print(f"zh_third_title_enhance: {doc}")
|
print(f"zh_third_title_enhance: {doc}")
|
||||||
third_title = get_third_level_title(doc.page_content)
|
third_title = get_third_level_title(doc.page_content)
|
||||||
if third_title:
|
if third_title:
|
||||||
title = third_title
|
title = third_title
|
||||||
#print(f"title: {title}")
|
print(f"title: {title}")
|
||||||
elif title:
|
elif title:
|
||||||
#print(f"title is not none")
|
print(f"title is not none")
|
||||||
temp_fourth_content = is_fourth_level_content(doc.page_content)
|
temp_fourth_content = is_fourth_level_content(doc.page_content)
|
||||||
if temp_fourth_content:
|
if temp_fourth_content:
|
||||||
#print(f"is_fourth_level_content : {temp_fourth_content}")
|
#print(f"is_fourth_level_content : {temp_fourth_content}")
|
||||||
doc.page_content = f"{title} {doc.page_content}"
|
doc.page_content = f"{title} {doc.page_content}"
|
||||||
else:
|
else:
|
||||||
title = None
|
title = None
|
||||||
#print(f"final title: {title}")
|
print(f"final title: {title}")
|
||||||
return docs
|
return docs
|
||||||
else:
|
else:
|
||||||
print("zh_third_title_enhance 文件不存在")
|
print("zh_third_title_enhance 文件不存在")
|
||||||
|
|
@ -181,20 +181,20 @@ def zh_second_title_enhance(docs: Document) -> Document:
|
||||||
title = None
|
title = None
|
||||||
if len(docs) > 0:
|
if len(docs) > 0:
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
#print(f"zh_second_title_enhance: {doc}")
|
print(f"zh_second_title_enhance: {doc}")
|
||||||
second_title = get_second_level_title(doc.page_content)
|
second_title = get_second_level_title(doc.page_content)
|
||||||
if second_title:
|
if second_title:
|
||||||
title = second_title
|
title = second_title
|
||||||
#print(f"title: {title}")
|
print(f"title: {title}")
|
||||||
elif title:
|
elif title:
|
||||||
#print(f"title is not none")
|
print(f"title is not none")
|
||||||
temp_third_content = is_third_level_content(doc.page_content)
|
temp_third_content = is_third_level_content(doc.page_content)
|
||||||
if temp_third_content:
|
if temp_third_content:
|
||||||
#print(f"is_third_level_content : {temp_third_content}")
|
print(f"is_third_level_content : {temp_third_content}")
|
||||||
doc.page_content = f"{title} {doc.page_content}"
|
doc.page_content = f"{title} {doc.page_content}"
|
||||||
else:
|
else:
|
||||||
title = None
|
title = None
|
||||||
#print(f"final title: {title}")
|
print(f"final title: {title}")
|
||||||
return docs
|
return docs
|
||||||
else:
|
else:
|
||||||
print("zh_second_title_enhance 文件不存在")
|
print("zh_second_title_enhance 文件不存在")
|
||||||
|
|
@ -204,19 +204,19 @@ def zh_first_title_enhance(docs: Document) -> Document:
|
||||||
title = None
|
title = None
|
||||||
if len(docs) > 0:
|
if len(docs) > 0:
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
#print(f"zh_first_title_enhance: {doc}")
|
print(f"zh_first_title_enhance: {doc}")
|
||||||
first_title = get_fist_level_title(doc.page_content)
|
first_title = get_fist_level_title(doc.page_content)
|
||||||
if first_title:
|
if first_title:
|
||||||
title = first_title
|
title = first_title
|
||||||
#print(f"title: {title}")
|
print(f"title: {title}")
|
||||||
elif title:
|
elif title:
|
||||||
temp_second_content = is_second_level_content(doc.page_content)
|
temp_second_content = is_second_level_content(doc.page_content)
|
||||||
if temp_second_content:
|
if temp_second_content:
|
||||||
#print(f"is_second_level_content : {temp_second_content}")
|
print(f"is_second_level_content : {temp_second_content}")
|
||||||
doc.page_content = f"{title} {doc.page_content}"
|
doc.page_content = f"{title} {doc.page_content}"
|
||||||
else:
|
else:
|
||||||
title = None
|
title = None
|
||||||
#print(f"final title: {title}")
|
print(f"final title: {title}")
|
||||||
return docs
|
return docs
|
||||||
else:
|
else:
|
||||||
print("zh_first_title_enhance 文件不存在")
|
print("zh_first_title_enhance 文件不存在")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue