diff --git a/document/sample_bidding_document.pdf b/document/sample_bidding_document.pdf new file mode 100644 index 0000000..3c6ad0e Binary files /dev/null and b/document/sample_bidding_document.pdf differ diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000..a3d1a9b --- /dev/null +++ b/requirement.txt @@ -0,0 +1,108 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.10.0 +astor==0.8.1 +async-timeout==4.0.3 +attrs==25.3.0 +beautifulsoup4==4.13.4 +cachetools==6.1.0 +certifi==2025.8.3 +chardet==5.2.0 +charset-normalizer==3.4.2 +colorlog==6.9.0 +cssselect==1.3.0 +cssutils==2.11.1 +dataclasses-json==0.6.7 +decorator==5.2.1 +distro==1.9.0 +einops==0.8.1 +et_xmlfile==2.0.0 +exceptiongroup==1.3.0 +filelock==3.18.0 +frozenlist==1.7.0 +fsspec==2025.7.0 +ftfy==6.3.1 +GPUtil==1.4.0 +greenlet==3.2.3 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 +httpx==0.28.1 +httpx-sse==0.4.1 +huggingface-hub==0.34.3 +idna==3.10 +imagesize==1.4.1 +Jinja2==3.1.6 +jiter==0.10.0 +joblib==1.5.1 +jsonpatch==1.33 +jsonpointer==3.0.0 +langchain==0.3.27 +langchain-community==0.3.27 +langchain-core==0.3.72 +langchain-openai==0.3.28 +langchain-text-splitters==0.3.9 +langsmith==0.4.11 +lxml==6.0.0 +MarkupSafe==3.0.2 +marshmallow==3.26.1 +more-itertools==10.7.0 +multidict==6.6.3 +mypy_extensions==1.1.0 +networkx==3.4.2 +numpy==2.2.6 +openai==1.98.0 +opencv-contrib-python==4.10.0.84 +openpyxl==3.1.5 +opt-einsum==3.3.0 +orjson==3.11.1 +packaging==25.0 +paddleocr==3.1.0 +paddlepaddle==3.1.0 +paddlepaddle-gpu==2.6.2 +paddlex==3.1.3 +pandas==2.3.1 +pillow==11.3.0 +premailer==3.10.0 +prettytable==3.16.0 +propcache==0.3.2 +protobuf==6.31.1 +py-cpuinfo==9.0.0 +pyclipper==1.3.0.post6 +pydantic==2.11.7 +pydantic-settings==2.10.1 +pydantic_core==2.33.2 +PyMuPDF==1.26.3 +pypdfium2==4.30.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.1 +pytz==2025.2 +PyYAML==6.0.2 +regex==2025.7.34 +requests==2.32.4 +requests-toolbelt==1.0.0 
# find_library_path.py
import fitz
import os

try:
    # fitz.__file__ points at the package's __init__.py; its parent
    # directory is therefore where PyMuPDF is installed on disk.
    library_directory = os.path.dirname(fitz.__file__)

    print("--- PyMuPDF Library Location ---")
    print(f"✅ PyMuPDF (fitz) is installed at: {library_directory}")
    print("\n下一步:请您用 VS Code 的文件浏览器打开上面的路径。")

except Exception as e:
    print(f"❌ Could not locate the library. Error: {e}")
import fitz  # PyMuPDF
from paddleocr import PaddleOCR
from PIL import Image
import io
import numpy as np
from typing import List, Dict, Any
import traceback
import json
import os


class PaddlePdfProcessor:
    """A robust PDF processor built on PaddleOCR.

    Renders each PDF page to an image with PyMuPDF, runs the PaddleOCR 3.x
    pipeline on it, and returns plain-Python text blocks sorted into
    reading order (top-to-bottom, left-to-right).
    """

    def __init__(self, pdf_path: str):
        """Open the PDF and initialize the OCR engine.

        Args:
            pdf_path: Path to the PDF file to process.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        print("正在初始化 PaddleOCR 模型...")
        # use_textline_orientation enables the text-line direction
        # classifier (PaddleOCR 3.x name for the old use_angle_cls flag).
        self.ocr_engine = PaddleOCR(use_textline_orientation=True, lang='ch')
        print("PaddleOCR 初始化完成。")

    def process_page(self, page_num: int) -> List[Dict[str, Any]]:
        """OCR a single page and return its structured text blocks.

        Args:
            page_num: 1-based page number.

        Returns:
            Text blocks for that page, in reading order.
        """
        page = self.doc[page_num - 1]
        # 200 dpi balances OCR accuracy against render time / memory.
        pix = page.get_pixmap(dpi=200)  # type: ignore
        img_bytes = pix.tobytes("png")
        image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img_array = np.array(image)

        result = self.ocr_engine.predict(input=img_array)
        # result[0] is an OCRResult object (PaddleOCR 3.x API).
        return self._parse_paddle_result(result[0] if result else None, page_num)

    def _parse_paddle_result(self, ocr_result, page_num: int) -> List[Dict[str, Any]]:
        """Parse a PaddleOCR 3.x ``OCRResult`` into plain-Python dicts.

        All coordinates and scores are cast to built-in ``float`` so the
        blocks are JSON-serializable — ``dt_polys`` entries are numpy
        arrays, and ``json.dump`` rejects numpy scalar types.
        """
        blocks: List[Dict[str, Any]] = []
        # Nothing detected on the page (or result was empty).
        if not ocr_result:
            return blocks

        # Three parallel sequences keyed on the OCRResult mapping.
        boxes = ocr_result['dt_polys']
        texts = ocr_result['rec_texts']
        scores = ocr_result['rec_scores']

        for box, text, score in zip(boxes, texts, scores):
            # box is a quadrilateral like [[x1, y1], ..., [x4, y4]];
            # collapse it to an axis-aligned bounding box, casting each
            # numpy scalar to a plain float as we go.
            x_coords = [float(p[0]) for p in box]
            y_coords = [float(p[1]) for p in box]
            bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]

            blocks.append({
                "type": "text",
                "content": text,
                "bbox": bbox,
                "page_num": page_num,
                "confidence": float(score),
            })

        # Sort into top-to-bottom, left-to-right reading order.
        return sorted(blocks, key=lambda b: (b['bbox'][1], b['bbox'][0]))

    def process_all_pages(self) -> List[Dict[str, Any]]:
        """Run OCR over every page and return all blocks in page order."""
        all_content_blocks: List[Dict[str, Any]] = []
        for i in range(len(self.doc)):
            page_number = i + 1
            print(f"--- 正在处理第 {page_number}/{len(self.doc)} 页 ---")
            all_content_blocks.extend(self.process_page(page_number))
        return all_content_blocks

    def close(self):
        """Release the underlying PyMuPDF document handle."""
        self.doc.close()


# --- Main entry point (with save/load caching of parse results) ---
if __name__ == '__main__':
    # 1. Input/output paths. The parse result is cached as JSON so
    #    repeated runs can skip the expensive OCR pass.
    pdf_path = "../document/sample_bidding_document.pdf"
    parsed_output_path = "parsed_output.json"

    all_blocks: List[Dict[str, Any]] = []

    # 2. Reuse a previously saved parse result when available.
    if os.path.exists(parsed_output_path):
        print(f"✅ 找到了已保存的解析结果 '{parsed_output_path}',直接加载...")
        with open(parsed_output_path, 'r', encoding='utf-8') as f:
            all_blocks = json.load(f)
    else:
        print("未找到已保存的结果,启动完整的PDF解析流程...")
        print(f"准备处理文件: {pdf_path}")

        processor = None
        try:
            # Kick off the expensive parsing pass.
            processor = PaddlePdfProcessor(pdf_path)
            all_blocks = processor.process_all_pages()

            # 3. Persist the freshly parsed blocks for the next run.
            print(f"\n💾 正在将解析结果保存到 '{parsed_output_path}'...")
            with open(parsed_output_path, 'w', encoding='utf-8') as f:
                # indent=4 keeps the JSON human-readable;
                # ensure_ascii=False writes Chinese characters verbatim.
                json.dump(all_blocks, f, indent=4, ensure_ascii=False)
            print("保存成功!")
        except FileNotFoundError:
            print(f"❌ 错误: 文件 '{pdf_path}' 未找到。")
        except Exception as e:
            print(f"❌ 程序在解析过程中发生严重错误: {e}")
            traceback.print_exc()
        finally:
            # Always release the document handle — the original only
            # closed it on the success path, leaking on any failure.
            if processor is not None:
                processor.close()

    # 4. Downstream processing is identical whether the data was loaded
    #    from cache or freshly generated.
    if all_blocks:
        print("\n" + "="*50)
        print(f"数据加载/处理完成,共获得 {len(all_blocks)} 个文本块。")
        print("="*50)

        print("\n--- 处理结果预览 (前 5 个文本块) ---")
        for block in all_blocks[:5]:
            # NOTE: JSON round-trips tuples as lists, so bbox is a list here.
            print(f"P{block['page_num']}: {block['content']} (置信度: {block['confidence']:.4f})")

        # Hook for downstream analysis, e.g. paragraph assembly:
        # paragraphs = assemble_paragraphs(all_blocks)
        # ...
    else:
        print("未能获取任何文本块,程序结束。")