func:pdf解析功能
This commit is contained in:
parent
9e8215d888
commit
14ac17ef63
Binary file not shown.
|
|
@ -0,0 +1,108 @@
|
|||
aiohappyeyeballs==2.6.1
|
||||
aiohttp==3.12.15
|
||||
aiosignal==1.4.0
|
||||
annotated-types==0.7.0
|
||||
anyio==4.10.0
|
||||
astor==0.8.1
|
||||
async-timeout==4.0.3
|
||||
attrs==25.3.0
|
||||
beautifulsoup4==4.13.4
|
||||
cachetools==6.1.0
|
||||
certifi==2025.8.3
|
||||
chardet==5.2.0
|
||||
charset-normalizer==3.4.2
|
||||
colorlog==6.9.0
|
||||
cssselect==1.3.0
|
||||
cssutils==2.11.1
|
||||
dataclasses-json==0.6.7
|
||||
decorator==5.2.1
|
||||
distro==1.9.0
|
||||
einops==0.8.1
|
||||
et_xmlfile==2.0.0
|
||||
exceptiongroup==1.3.0
|
||||
filelock==3.18.0
|
||||
frozenlist==1.7.0
|
||||
fsspec==2025.7.0
|
||||
ftfy==6.3.1
|
||||
GPUtil==1.4.0
|
||||
greenlet==3.2.3
|
||||
h11==0.16.0
|
||||
hf-xet==1.1.5
|
||||
httpcore==1.0.9
|
||||
httpx==0.28.1
|
||||
httpx-sse==0.4.1
|
||||
huggingface-hub==0.34.3
|
||||
idna==3.10
|
||||
imagesize==1.4.1
|
||||
Jinja2==3.1.6
|
||||
jiter==0.10.0
|
||||
joblib==1.5.1
|
||||
jsonpatch==1.33
|
||||
jsonpointer==3.0.0
|
||||
langchain==0.3.27
|
||||
langchain-community==0.3.27
|
||||
langchain-core==0.3.72
|
||||
langchain-openai==0.3.28
|
||||
langchain-text-splitters==0.3.9
|
||||
langsmith==0.4.11
|
||||
lxml==6.0.0
|
||||
MarkupSafe==3.0.2
|
||||
marshmallow==3.26.1
|
||||
more-itertools==10.7.0
|
||||
multidict==6.6.3
|
||||
mypy_extensions==1.1.0
|
||||
networkx==3.4.2
|
||||
numpy==2.2.6
|
||||
openai==1.98.0
|
||||
opencv-contrib-python==4.10.0.84
|
||||
openpyxl==3.1.5
|
||||
opt-einsum==3.3.0
|
||||
orjson==3.11.1
|
||||
packaging==25.0
|
||||
paddleocr==3.1.0
|
||||
paddlepaddle==3.1.0
|
||||
paddlepaddle-gpu==2.6.2
|
||||
paddlex==3.1.3
|
||||
pandas==2.3.1
|
||||
pillow==11.3.0
|
||||
premailer==3.10.0
|
||||
prettytable==3.16.0
|
||||
propcache==0.3.2
|
||||
protobuf==6.31.1
|
||||
py-cpuinfo==9.0.0
|
||||
pyclipper==1.3.0.post6
|
||||
pydantic==2.11.7
|
||||
pydantic-settings==2.10.1
|
||||
pydantic_core==2.33.2
|
||||
PyMuPDF==1.26.3
|
||||
pypdfium2==4.30.0
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.1.1
|
||||
pytz==2025.2
|
||||
PyYAML==6.0.2
|
||||
regex==2025.7.34
|
||||
requests==2.32.4
|
||||
requests-toolbelt==1.0.0
|
||||
ruamel.yaml==0.18.14
|
||||
ruamel.yaml.clib==0.2.12
|
||||
scikit-learn==1.7.1
|
||||
scipy==1.15.3
|
||||
shapely==2.1.1
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.7
|
||||
SQLAlchemy==2.0.42
|
||||
tenacity==9.1.2
|
||||
threadpoolctl==3.6.0
|
||||
tiktoken==0.9.0
|
||||
tokenizers==0.21.4
|
||||
tqdm==4.67.1
|
||||
typing-inspect==0.9.0
|
||||
typing-inspection==0.4.1
|
||||
typing_extensions==4.14.1
|
||||
tzdata==2025.2
|
||||
ujson==5.10.0
|
||||
urllib3==2.5.0
|
||||
wcwidth==0.2.13
|
||||
yarl==1.20.1
|
||||
zstandard==0.23.0
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
# find_library_path.py
#
# Locate the on-disk installation directory of PyMuPDF (the ``fitz``
# module) so it can be opened in an editor's file browser.
import fitz
import os

try:
    # A module's __file__ attribute points at its __init__.py on disk;
    # the parent directory of that file is the library's install location.
    install_dir = os.path.dirname(fitz.__file__)

    print("--- PyMuPDF Library Location ---")
    print(f"✅ PyMuPDF (fitz) is installed at: {install_dir}")
    print("\n下一步:请您用 VS Code 的文件浏览器打开上面的路径。")
except Exception as e:
    print(f"❌ Could not locate the library. Error: {e}")
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
import fitz # PyMuPDF
|
||||
from paddleocr import PaddleOCR
|
||||
from PIL import Image
|
||||
import io
|
||||
import numpy as np
|
||||
from typing import List, Dict, Any
|
||||
import traceback
|
||||
import json
|
||||
import traceback
|
||||
import os
|
||||
|
||||
class PaddlePdfProcessor:
    """
    A robust PDF processor built on PaddleOCR.

    Renders each PDF page to an image with PyMuPDF, runs PaddleOCR on it,
    and returns the recognized text as structured, JSON-serializable
    blocks. (Targets PyMuPDF, PaddlePaddle 3.x and the PaddleOCR 3.x API.)

    Can be used as a context manager so the underlying document is always
    closed:

        with PaddlePdfProcessor(path) as p:
            blocks = p.process_all_pages()
    """

    def __init__(self, pdf_path: str):
        """Open the PDF at *pdf_path* and initialize the OCR engine."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        print("正在初始化 PaddleOCR 模型...")
        self.ocr_engine = PaddleOCR(use_textline_orientation=True, lang='ch')
        print("PaddleOCR 初始化完成。")

    def process_page(self, page_num: int) -> List[Dict[str, Any]]:
        """OCR a single page (1-based *page_num*) and return its text blocks."""
        page = self.doc[page_num - 1]
        # 200 DPI balances OCR accuracy against rendering time/memory.
        pix = page.get_pixmap(dpi=200)  # type: ignore
        img_bytes = pix.tobytes("png")
        image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img_array = np.array(image)

        result = self.ocr_engine.predict(input=img_array)
        # result[0] is an OCRResult object; result may be empty for blank pages.
        structured_blocks = self._parse_paddle_result(result[0] if result else None, page_num)
        return structured_blocks

    def _parse_paddle_result(self, ocr_result, page_num: int) -> List[Dict[str, Any]]:
        """
        Parse the OCRResult object returned by PaddleOCR 3.x.

        Returns a list of block dicts sorted in reading order
        (top-to-bottom, then left-to-right). All numeric values are cast
        to built-in ``float`` so the blocks can be passed straight to
        ``json.dump`` — numpy scalar types are not JSON-serializable and
        would otherwise raise ``TypeError`` when the results are saved.
        """
        blocks: List[Dict[str, Any]] = []
        # Nothing recognized (or a blank page): return an empty list.
        if not ocr_result:
            return blocks

        # The OCRResult exposes three parallel sequences by key.
        boxes = ocr_result['dt_polys']
        texts = ocr_result['rec_texts']
        scores = ocr_result['rec_scores']

        for box, text, score in zip(boxes, texts, scores):
            # box is a quadrilateral like [[x1, y1], ..., [x4, y4]] (a numpy
            # array in practice); collapse it to an axis-aligned bounding box
            # [x_min, y_min, x_max, y_max] of plain floats.
            x_coords = [float(p[0]) for p in box]
            y_coords = [float(p[1]) for p in box]
            bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]

            blocks.append({
                "type": "text",
                "content": text,
                "bbox": bbox,
                "page_num": page_num,
                # float() guards against numpy.float32 scores breaking json.dump.
                "confidence": float(score),
            })

        # Sort into reading order: primary key is the top edge, secondary
        # key the left edge of each block.
        return sorted(blocks, key=lambda b: (b['bbox'][1], b['bbox'][0]))

    def process_all_pages(self) -> List[Dict[str, Any]]:
        """OCR every page of the document and return all blocks in order."""
        all_content_blocks: List[Dict[str, Any]] = []
        for i in range(len(self.doc)):
            page_number = i + 1
            print(f"--- 正在处理第 {page_number}/{len(self.doc)} 页 ---")
            blocks = self.process_page(page_number)
            all_content_blocks.extend(blocks)
        return all_content_blocks

    def close(self):
        """Release the underlying PyMuPDF document."""
        self.doc.close()

    def __enter__(self):
        """Context-manager entry: return the processor itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit: always close the document."""
        self.close()
        return False
|
||||
|
||||
# --- Script entry point (with save/load caching of parse results) ---
if __name__ == '__main__':
    # 1. Input and output paths.
    pdf_path = "../document/sample_bidding_document.pdf"
    # Parsed blocks are cached in this JSON file so re-runs skip the OCR.
    parsed_output_path = "parsed_output.json"

    all_blocks = []

    # 2. Reuse a previously saved parse if one exists.
    if os.path.exists(parsed_output_path):
        print(f"✅ 找到了已保存的解析结果 '{parsed_output_path}',直接加载...")
        with open(parsed_output_path, 'r', encoding='utf-8') as f:
            all_blocks = json.load(f)
    else:
        print(f"未找到已保存的结果,启动完整的PDF解析流程...")
        print(f"准备处理文件: {pdf_path}")

        try:
            processor = PaddlePdfProcessor(pdf_path)
            try:
                # The expensive part: render + OCR every page.
                all_blocks = processor.process_all_pages()
            finally:
                # Always release the PyMuPDF document, even if OCR raises —
                # the original only closed it on the success path.
                processor.close()

            # 3. Persist the fresh result for the next run.
            print(f"\n💾 正在将解析结果保存到 '{parsed_output_path}'...")
            with open(parsed_output_path, 'w', encoding='utf-8') as f:
                # indent=4 keeps the JSON readable; ensure_ascii=False
                # writes Chinese characters verbatim instead of \u escapes.
                json.dump(all_blocks, f, indent=4, ensure_ascii=False)
            print("保存成功!")

        except FileNotFoundError:
            print(f"❌ 错误: 文件 '{pdf_path}' 未找到。")
        except Exception as e:
            print(f"❌ 程序在解析过程中发生严重错误: {e}")
            traceback.print_exc()

    # 4. Downstream processing works the same on loaded or fresh data.
    if all_blocks:
        print("\n" + "="*50)
        print(f"数据加载/处理完成,共获得 {len(all_blocks)} 个文本块。")
        print("="*50)

        print("\n--- 处理结果预览 (前 5 个文本块) ---")
        for block in all_blocks[:5]:
            # NOTE: JSON round-trips tuples as lists, so bbox is a list here.
            print(f"P{block['page_num']}: {block['content']} (置信度: {block['confidence']:.4f})")

        # Hook for later analysis passes, e.g. paragraph assembly:
        # paragraphs = assemble_paragraphs(all_blocks)
    else:
        print("未能获取任何文本块,程序结束。")
|
||||
Loading…
Reference in New Issue