enhance RapidWordLoader

This commit is contained in:
wvivi2023 2024-01-17 10:49:59 +08:00
parent afa07ad208
commit 51424db243
4 changed files with 70 additions and 7 deletions

View File

@ -1,4 +1,5 @@
from .mypdfloader import RapidOCRPDFLoader from .mypdfloader import RapidOCRPDFLoader
from .myimgloader import RapidOCRLoader from .myimgloader import RapidOCRLoader
from .customiedpdfloader import CustomizedPDFLoader from .customiedpdfloader import CustomizedPDFLoader
from.mywordload import RapidWordLoader from .mywordload import RapidWordLoader
#from .customercore import custom_group_broken_paragraphs

View File

@ -0,0 +1,61 @@
import re
from unstructured.nlp.patterns import (
DOUBLE_PARAGRAPH_PATTERN_RE,
E_BULLET_PATTERN,
PARAGRAPH_PATTERN,
PARAGRAPH_PATTERN_RE,
UNICODE_BULLETS_RE,
)
from unstructured.cleaners.core import group_bullet_paragraph
def custom_group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Re-join paragraphs whose lines were broken only for layout purposes.

    ``text`` is cut into paragraphs with ``paragraph_split`` and blank
    paragraphs are discarded.  A paragraph that starts with a bullet marker
    (according to ``UNICODE_BULLETS_RE`` or ``E_BULLET_PATTERN``) is handed to
    ``group_bullet_paragraph`` and every item it returns is kept as a separate
    paragraph.  In every other paragraph each ``PARAGRAPH_PATTERN`` match is
    replaced by a single space.  The resulting paragraphs are joined with a
    blank line between them.

    Example (assuming ``PARAGRAPH_PATTERN`` matches a line break together
    with its surrounding whitespace -- confirm in ``unstructured.nlp.patterns``)::

        The big red fox
        is walking down the lane.

        At the end of the lane
        the fox met a bear.

    becomes::

        The big red fox is walking down the lane.

        At the end of the lane the fox met a bear.

    Args:
        text: Raw text to regroup.
        line_split: Not used by this implementation; kept in the signature so
            existing callers that pass it keep working.
        paragraph_split: Compiled pattern used to split ``text`` into
            paragraphs.

    Returns:
        The regrouped paragraphs joined by ``"\\n\\n"``.
    """
    merged = []
    for chunk in paragraph_split.split(text):
        stripped = chunk.strip()
        if not stripped:
            # Nothing but whitespace between two separators: drop it.
            continue

        # No per-line splitting is done here: a paragraph is either a bullet
        # group or is collapsed as a whole.
        # pytesseract converts some bullet points to standalone "e" characters,
        # hence the second pattern.
        starts_with_bullet = UNICODE_BULLETS_RE.match(stripped) or E_BULLET_PATTERN.match(
            stripped
        )
        if starts_with_bullet:
            merged.extend(group_bullet_paragraph(chunk))
        else:
            merged.append(re.sub(PARAGRAPH_PATTERN, " ", chunk))

    return "\n\n".join(merged)
# str1 = "手工分段**绝缘装置10 工作斗在额定载荷下起升至最大平台高度制动后15 min, 工作斗下沉量应不超过该工况最大 平台高度的0.3%。"
# str2 = "手工分段**操控系统12 电气系统的要求如下:"
# custom_group_broken_paragraphs(str1)
# custom_group_broken_paragraphs(str2)

View File

@ -7,8 +7,10 @@ from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
#from langchain.document_loaders.unstructured import UnstructuredFileLoader from unstructured.partition.text import partition_text
#from langchain.document_loaders.word_document import Docx2txtLoader import unstructured.cleaners.core
from .customercore import custom_group_broken_paragraphs
unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
class RapidWordLoader(UnstructuredFileLoader): class RapidWordLoader(UnstructuredFileLoader):
def _get_elements(self) -> List: def _get_elements(self) -> List:
@ -59,7 +61,6 @@ class RapidWordLoader(UnstructuredFileLoader):
doc = docxDocument(filepath) doc = docxDocument(filepath)
for block in iter_block_items(doc): for block in iter_block_items(doc):
if isinstance(block,Paragraph): if isinstance(block,Paragraph):
#print(f"Paragraph:{block.text}") #print(f"Paragraph:{block.text}")
resp += (block.text + "\n\n") resp += (block.text + "\n\n")
elif isinstance(block, Table): elif isinstance(block, Table):
@ -71,8 +72,8 @@ class RapidWordLoader(UnstructuredFileLoader):
return resp return resp
text = word2text(self.file_path) text = word2text(self.file_path)
from unstructured.partition.text import partition_text listText = partition_text(text=text, **self.unstructured_kwargs)
return partition_text(text=text, paragraph_grouper = False, **self.unstructured_kwargs) return listText
if __name__ == "__main__": if __name__ == "__main__":
loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx") loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")

View File

@ -91,7 +91,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
_good_splits = [] _good_splits = []
_separator = "" if self._keep_separator else separator _separator = "" if self._keep_separator else separator
for s in splits: for s in splits:
print(f"***s:{s},len:{self._length_function(s)}") #print(f"***s:{s},len:{self._length_function(s)}")
if self._length_function(s) < self._chunk_size: if self._length_function(s) < self._chunk_size:
_good_splits.append(s) _good_splits.append(s)
#print(f"***_good_splits.append(s):{s}") #print(f"***_good_splits.append(s):{s}")