diff --git a/document_loaders/mypdfloader.py b/document_loaders/mypdfloader.py index faaf63d..e7e2c76 100644 --- a/document_loaders/mypdfloader.py +++ b/document_loaders/mypdfloader.py @@ -2,8 +2,10 @@ from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader from configs import PDF_OCR_THRESHOLD from document_loaders.ocr import get_ocr +#PDF_OCR_THRESHOLD= (0.6,0.6) +#from ocr import get_ocr import tqdm - +import re class RapidOCRPDFLoader(UnstructuredFileLoader): def _get_elements(self) -> List: @@ -18,10 +20,13 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): for i, page in enumerate(doc): b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i)) b_unit.refresh() + print(f"****page:{i+1}****") text = page.get_text("") - resp += text + "\n" + text_lines = text.strip().split("\n") + #print(f"文字内容:{text_lines}") img_list = page.get_image_info(xrefs=True) + ocr_result = [] for img in img_list: if xref := img.get("xref"): bbox = img["bbox"] @@ -34,10 +39,25 @@ class RapidOCRPDFLoader(UnstructuredFileLoader): result, _ = ocr(img_array) if result: ocr_result = [line[1] for line in result] - resp += "\n".join(ocr_result) + #print(f"图片内容:{ocr_result}") + #resp += "\n".join(ocr_result) + + if (len(ocr_result)>0): + resp += "\n".join(ocr_result) + else: + if text_lines: + # 假设页码在最后一行 + if text_lines[-1].isdigit(): + text = "\n".join(text_lines[:-1]) + print(f"******去除了页码") + resp += text + "\n" # 更新进度 b_unit.update(1) + + resp = re.sub(r'((? Document: title = None - print(f"zh_third_title_enhance ....") + #print(f"zh_third_title_enhance ....") if len(docs) > 0: for doc in docs: - #print(f"zh_third_title_enhance: {doc}") + print(f"zh_third_title_enhance: {doc}") third_title = get_third_level_title(doc.page_content) if third_title: title = third_title - #print(f"title: {title}") + print(f"title: {title}") elif title: - #print(f"title is not none") + print(f"title is not none") temp_fourth_content = is_fourth_level_content(doc.page_content) if temp_fourth_content: #print(f"is_fourth_level_content : {temp_fourth_content}") doc.page_content = f"{title} {doc.page_content}" else: title = None - #print(f"final title: {title}") + print(f"final title: {title}") return docs else: print("zh_third_title_enhance 文件不存在") @@ -181,20 +181,20 @@ def zh_second_title_enhance(docs: Document) -> Document: title = None if len(docs) > 0: for doc in docs: - #print(f"zh_second_title_enhance: {doc}") + print(f"zh_second_title_enhance: {doc}") second_title = get_second_level_title(doc.page_content) if second_title: title = second_title - #print(f"title: {title}") + print(f"title: {title}") elif title: - #print(f"title is not none") + print(f"title is not none") temp_third_content = is_third_level_content(doc.page_content) if temp_third_content: - #print(f"is_third_level_content : {temp_third_content}") + print(f"is_third_level_content : {temp_third_content}") doc.page_content = f"{title} {doc.page_content}" else: title = None - #print(f"final title: {title}") + print(f"final title: {title}") return docs else: print("zh_second_title_enhance 文件不存在") @@ -204,19 +204,19 @@ def zh_first_title_enhance(docs: Document) -> Document: title = None if len(docs) > 0: for doc in docs: - #print(f"zh_first_title_enhance: {doc}") + print(f"zh_first_title_enhance: {doc}") first_title = get_fist_level_title(doc.page_content) if first_title: title = first_title - #print(f"title: {title}") + print(f"title: {title}") elif title: temp_second_content = is_second_level_content(doc.page_content) if temp_second_content: - #print(f"is_second_level_content : {temp_second_content}") + print(f"is_second_level_content : {temp_second_content}") doc.page_content = f"{title} {doc.page_content}" else: title = None - #print(f"final title: {title}") + print(f"final title: {title}") return docs else: print("zh_first_title_enhance 文件不存在")