# Langchain-Chatchat/document_loaders/mywordload.py

# 85 lines
# 3.6 KiB
# Python

from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from docx import Document as docxDocument
from docx.document import Document as _Document
from docx.table import _Cell
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from unstructured.partition.text import partition_text
import unstructured.cleaners.core
from .customercore import custom_group_broken_paragraphs
# Monkey-patch: rebind `group_broken_paragraphs` on the shared
# `unstructured.cleaners.core` module object to the project's
# `custom_group_broken_paragraphs` (imported from `.customercore` above).
# The assignment runs once, at import time of this module.
# NOTE(review): a module that already executed
# `from unstructured.cleaners.core import group_broken_paragraphs` before this
# line keeps its own reference to the original function -- confirm that
# `partition_text` actually picks up the replacement.
unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
class RapidWordLoader(UnstructuredFileLoader):
    """Loader that converts a .docx file to text and partitions it.

    Paragraphs and tables are read in document order. Each table data row is
    rendered as one ``{"header": "value", ...}`` line, so table content stays
    next to the surrounding paragraphs in the text handed to
    ``partition_text``.
    """

    def _get_elements(self) -> List:
        """Return the elements produced by ``partition_text`` for the file.

        The document at ``self.file_path`` is flattened to a single string
        (see ``word2text``) and passed to ``partition_text`` together with
        ``self.unstructured_kwargs``.
        """

        def iter_block_items(parent):
            """
            Yield each paragraph and table child within *parent*, in document order.
            Each returned value is an instance of either Table or Paragraph.

            Raises:
                ValueError: if *parent* is neither a document nor a table cell.
            """
            if isinstance(parent, _Document):
                parent_elm = parent._element.body
            elif isinstance(parent, _Cell):
                parent_elm = parent._element
            else:
                raise ValueError("something's not right")
            for child in parent_elm.iterchildren():
                if isinstance(child, CT_P):
                    yield Paragraph(child, parent)
                elif isinstance(child, CT_Tbl):
                    yield Table(child, parent)

        def read_table(table):
            """Render *table* as text, one ``{"header": "value", ...}`` line per data row.

            The first row supplies the column headers and is not emitted
            itself. A table without rows yields an empty string. A cell that
            has no matching header (the row is wider than the header row) is
            keyed by its 1-based position as ``column_<n>`` instead of
            raising ``IndexError``.
            """
            rows = list(table.rows)
            if not rows:
                return ""
            # Column titles come from the first row.
            headers = [cell.text.strip() for cell in rows[0].cells]
            lines = []
            # Start at the second row because the first one holds the titles.
            for row in rows[1:]:
                row_data = []
                for col_no, cell in enumerate(row.cells, 1):
                    # Fall back to a positional key when the header row is shorter.
                    if col_no <= len(headers):
                        header = headers[col_no - 1]
                    else:
                        header = f"column_{col_no}"
                    cell_text = cell.text.strip()
                    row_data.append(f'"{header}": "{cell_text}"')
                # Join the cells of one row with commas and wrap them in braces.
                row_string = ", ".join(row_data)
                lines.append(f"{{{row_string}}}\n")
            return "".join(lines)

        def word2text(filepath):
            """Return the text of the .docx at *filepath*, tables included.

            Best effort: errors are printed rather than raised. A table that
            cannot be rendered is skipped so the blocks after it are still
            included; any other failure ends the conversion and the text
            collected so far is returned.
            """
            parts = []
            try:
                doc = docxDocument(filepath)
                for block in iter_block_items(doc):
                    if isinstance(block, Paragraph):
                        parts.append(block.text + "\n\n")
                    elif isinstance(block, Table):
                        # Isolate table failures so one bad table does not
                        # truncate the rest of the document.
                        try:
                            parts.append(read_table(block) + "\n")
                        except Exception as e:
                            print(f"word2text error:{e}")
            except ValueError:
                print("Error:input invalid parameter")
            except Exception as e:
                print(f"word2text error:{e}")
            return "".join(parts)

        text = word2text(self.file_path)
        return partition_text(text=text, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke test: load one .docx file and print the resulting documents.
    # The path may be given as the first command-line argument; without one the
    # original developer-local sample path is used.
    import sys

    default_path = "/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx"
    docx_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    loader = RapidWordLoader(file_path=docx_path)
    docs = loader.load()
    print(docs)