enhance RapidWordLoader
This commit is contained in:
parent
afa07ad208
commit
51424db243
|
|
@ -1,4 +1,5 @@
|
||||||
from .mypdfloader import RapidOCRPDFLoader
|
from .mypdfloader import RapidOCRPDFLoader
|
||||||
from .myimgloader import RapidOCRLoader
|
from .myimgloader import RapidOCRLoader
|
||||||
from .customiedpdfloader import CustomizedPDFLoader
|
from .customiedpdfloader import CustomizedPDFLoader
|
||||||
from.mywordload import RapidWordLoader
|
from .mywordload import RapidWordLoader
|
||||||
|
#from .customercore import custom_group_broken_paragraphs
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
from unstructured.nlp.patterns import (
|
||||||
|
DOUBLE_PARAGRAPH_PATTERN_RE,
|
||||||
|
E_BULLET_PATTERN,
|
||||||
|
PARAGRAPH_PATTERN,
|
||||||
|
PARAGRAPH_PATTERN_RE,
|
||||||
|
UNICODE_BULLETS_RE,
|
||||||
|
)
|
||||||
|
from unstructured.cleaners.core import group_bullet_paragraph
|
||||||
|
|
||||||
|
def custom_group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Join lines that were broken only for visual/formatting reasons.

    The text is cut into paragraphs with ``paragraph_split``. Paragraphs that
    are empty after stripping are discarded. A paragraph whose stripped text
    matches ``UNICODE_BULLETS_RE`` or ``E_BULLET_PATTERN`` is passed to
    ``group_bullet_paragraph`` and each returned item is kept as a separate
    paragraph. Every other paragraph has each ``PARAGRAPH_PATTERN`` match
    replaced by a single space.

    For example:

        '''The big red fox
        is walking down the lane.

        At the end of the lane
        the fox met a bear.'''

    Gets converted to

        '''The big red fox is walking down the lane.

        At the end of the lane the fox met a bear.'''

    Args:
        text: The raw text to regroup.
        line_split: Accepted so the signature stays compatible with callers,
            but not used by this implementation.
        paragraph_split: Compiled pattern used to split ``text`` into
            paragraphs.

    Returns:
        The retained paragraphs joined by a blank line.
    """
    grouped = []
    for chunk in paragraph_split.split(text):
        stripped = chunk.strip()
        if not stripped:
            continue

        # NOTE: the per-line split with ``line_split`` is intentionally not
        # performed here, so runs of short lines (e.g. a license header) get
        # no special handling and are joined like any other paragraph.

        # pytesseract converts some bullet points to standalone "e" characters
        looks_bulleted = UNICODE_BULLETS_RE.match(stripped) or E_BULLET_PATTERN.match(stripped)
        if looks_bulleted:
            grouped.extend(group_bullet_paragraph(chunk))
            continue

        grouped.append(re.sub(PARAGRAPH_PATTERN, " ", chunk))

    return "\n\n".join(grouped)
|
||||||
|
|
||||||
|
|
||||||
|
# str1 = "手工分段**绝缘装置(10) 工作斗在额定载荷下起升至最大平台高度,制动后15 min, 工作斗下沉量应不超过该工况最大 平台高度的0.3%。"
|
||||||
|
# str2 = "手工分段**操控系统(12) 电气系统的要求如下:"
|
||||||
|
# custom_group_broken_paragraphs(str1)
|
||||||
|
# custom_group_broken_paragraphs(str2)
|
||||||
|
|
||||||
|
|
@ -7,8 +7,10 @@ from docx.oxml.text.paragraph import CT_P
|
||||||
from docx.oxml.table import CT_Tbl
|
from docx.oxml.table import CT_Tbl
|
||||||
from docx.table import _Cell, Table
|
from docx.table import _Cell, Table
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
#from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from unstructured.partition.text import partition_text
|
||||||
#from langchain.document_loaders.word_document import Docx2txtLoader
|
import unstructured.cleaners.core
|
||||||
|
from .customercore import custom_group_broken_paragraphs
|
||||||
|
unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
|
||||||
|
|
||||||
class RapidWordLoader(UnstructuredFileLoader):
|
class RapidWordLoader(UnstructuredFileLoader):
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
|
|
@ -59,7 +61,6 @@ class RapidWordLoader(UnstructuredFileLoader):
|
||||||
doc = docxDocument(filepath)
|
doc = docxDocument(filepath)
|
||||||
for block in iter_block_items(doc):
|
for block in iter_block_items(doc):
|
||||||
if isinstance(block,Paragraph):
|
if isinstance(block,Paragraph):
|
||||||
|
|
||||||
#print(f"Paragraph:{block.text}")
|
#print(f"Paragraph:{block.text}")
|
||||||
resp += (block.text + "\n\n")
|
resp += (block.text + "\n\n")
|
||||||
elif isinstance(block, Table):
|
elif isinstance(block, Table):
|
||||||
|
|
@ -71,8 +72,8 @@ class RapidWordLoader(UnstructuredFileLoader):
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
text = word2text(self.file_path)
|
text = word2text(self.file_path)
|
||||||
from unstructured.partition.text import partition_text
|
listText = partition_text(text=text, **self.unstructured_kwargs)
|
||||||
return partition_text(text=text, paragraph_grouper = False, **self.unstructured_kwargs)
|
return listText
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")
|
loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx")
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
_good_splits = []
|
_good_splits = []
|
||||||
_separator = "" if self._keep_separator else separator
|
_separator = "" if self._keep_separator else separator
|
||||||
for s in splits:
|
for s in splits:
|
||||||
print(f"***s:{s},len:{self._length_function(s)}")
|
#print(f"***s:{s},len:{self._length_function(s)}")
|
||||||
if self._length_function(s) < self._chunk_size:
|
if self._length_function(s) < self._chunk_size:
|
||||||
_good_splits.append(s)
|
_good_splits.append(s)
|
||||||
#print(f"***_good_splits.append(s):{s}")
|
#print(f"***_good_splits.append(s):{s}")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue