62 lines
2.1 KiB
Python
62 lines
2.1 KiB
Python
import re
|
||
|
||
from unstructured.nlp.patterns import (
|
||
DOUBLE_PARAGRAPH_PATTERN_RE,
|
||
E_BULLET_PATTERN,
|
||
PARAGRAPH_PATTERN,
|
||
PARAGRAPH_PATTERN_RE,
|
||
UNICODE_BULLETS_RE,
|
||
)
|
||
from unstructured.cleaners.core import group_bullet_paragraph
|
||
|
||
def custom_group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Merge lines that were broken only for visual/formatting reasons.

    The text is cut into chunks with ``paragraph_split``. Chunks that are
    empty after stripping whitespace are dropped. A chunk whose stripped form
    starts with a match of ``UNICODE_BULLETS_RE`` or ``E_BULLET_PATTERN`` is
    handed to ``group_bullet_paragraph`` and every item it returns is kept as
    a separate paragraph. Any other chunk has each ``PARAGRAPH_PATTERN`` match
    replaced by a single space. The resulting paragraphs are joined with a
    blank line (``"\\n\\n"``).

    For example (assuming the default patterns treat a single newline as a
    line break and a double newline as a paragraph break):

    '''The big red fox
    is walking down the lane.

    At the end of the lane
    the fox met a bear.'''

    Gets converted to

    '''The big red fox is walking down the lane.

    At the end of the lane the fox met a bear.'''

    Args:
        text: The raw text to regroup.
        line_split: Accepted as part of the signature but currently unused;
            the per-line split that consumed it is disabled in this variant.
        paragraph_split: Compiled pattern used to cut ``text`` into chunks.

    Returns:
        The regrouped paragraphs joined by ``"\\n\\n"``; an empty string when
        no chunk has non-whitespace content.
    """
    merged = []
    for chunk in paragraph_split.split(text):
        stripped = chunk.strip()
        if not stripped:
            # Nothing but whitespace between two paragraph separators.
            continue

        # pytesseract converts some bullet points to standalone "e"
        # characters, so both bullet patterns are checked.
        is_bullet = UNICODE_BULLETS_RE.match(stripped) or E_BULLET_PATTERN.match(stripped)
        if not is_bullet:
            # Ordinary paragraph: collapse its internal line breaks.
            merged.append(re.sub(PARAGRAPH_PATTERN, " ", chunk))
            continue

        # Bullet paragraph: the helper gets the unstripped chunk and may
        # return several items, each kept as its own paragraph.
        merged.extend(group_bullet_paragraph(chunk))

    return "\n\n".join(merged)
|
||
|
||
|
||
# str1 = "手工分段**绝缘装置(10) 工作斗在额定载荷下起升至最大平台高度,制动后15 min, 工作斗下沉量应不超过该工况最大 平台高度的0.3%。"
|
||
# str2 = "手工分段**操控系统(12) 电气系统的要求如下:"
|
||
# custom_group_broken_paragraphs(str1)
|
||
# custom_group_broken_paragraphs(str2)
|
||
|