import re

from unstructured.nlp.patterns import (
    DOUBLE_PARAGRAPH_PATTERN_RE,
    E_BULLET_PATTERN,
    PARAGRAPH_PATTERN,
    PARAGRAPH_PATTERN_RE,
    UNICODE_BULLETS_RE,
)
from unstructured.cleaners.core import group_bullet_paragraph

def custom_group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Groups paragraphs that have line breaks for visual/formatting purposes.
    For example:

    '''The big red fox
    is walking down the lane.

    At the end of the lane
    the fox met a bear.'''

    Gets converted to

    '''The big red fox is walking down the lane.
    At the end of the land the fox met a bear.'''
    """
    paragraphs = paragraph_split.split(text)
    clean_paragraphs = []
    for paragraph in paragraphs:
        if not paragraph.strip():
            continue
        # NOTE(robinson) - This block is to account for lines like the following that shouldn't be
        # grouped together, but aren't separated by a double line break.
        #     Apache License
        #     Version 2.0, January 2004
        #     http://www.apache.org/licenses/

        #para_split = line_split.split(paragraph)

        # pytesseract converts some bullet points to standalone "e" characters
        if UNICODE_BULLETS_RE.match(paragraph.strip()) or E_BULLET_PATTERN.match(paragraph.strip()):
            tempList = group_bullet_paragraph(paragraph)
            clean_paragraphs.extend(tempList)
            #print(f"new 11111:{tempList}")
        else:
            tempList = re.sub(PARAGRAPH_PATTERN, " ", paragraph)
            clean_paragraphs.append(tempList)
            #print(f"new 333333:{tempList}")

    return "\n\n".join(clean_paragraphs)


# str1 = "手工分段**绝缘装置（10）  工作斗在额定载荷下起升至最大平台高度，制动后15 min, 工作斗下沉量应不超过该工况最大 平台高度的0.3%。"
# str2 = "手工分段**操控系统（12） 电气系统的要求如下："
# custom_group_broken_paragraphs(str1)
# custom_group_broken_paragraphs(str2)