# Langchain-Chatchat/document_loaders/customercore.py
#
# 62 lines, 2.1 KiB, Python
#
# Viewer warning carried over from the code-hosting page: this file contains
# ambiguous Unicode characters, i.e. characters that might be confused with
# other characters. If that is intentional, the warning can be safely ignored.

import re
from unstructured.nlp.patterns import (
DOUBLE_PARAGRAPH_PATTERN_RE,
E_BULLET_PATTERN,
PARAGRAPH_PATTERN,
PARAGRAPH_PATTERN_RE,
UNICODE_BULLETS_RE,
)
from unstructured.cleaners.core import group_bullet_paragraph
def custom_group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Re-join paragraphs whose lines were broken only for layout reasons.

    For example:

        '''The big red fox
        is walking down the lane.

        At the end of the lane
        the fox met a bear.'''

    gets converted to

        '''The big red fox is walking down the lane.

        At the end of the lane the fox met a bear.'''

    Args:
        text: The raw text to regroup.
        line_split: Accepted so the signature stays call-compatible, but not
            consulted by this implementation; the per-line split that used it
            is disabled.
        paragraph_split: Compiled pattern used to cut ``text`` into paragraph
            candidates.

    Returns:
        The cleaned paragraphs joined with a blank line (``"\n\n"``) between
        them. Whitespace-only candidates are dropped.
    """
    grouped = []
    for chunk in paragraph_split.split(text):
        stripped = chunk.strip()
        if not stripped:
            # Nothing but whitespace between two paragraph breaks.
            continue

        # pytesseract converts some bullet points to standalone "e"
        # characters, so both bullet patterns are checked.
        if UNICODE_BULLETS_RE.match(stripped) or E_BULLET_PATTERN.match(stripped):
            grouped += group_bullet_paragraph(chunk)
            continue

        # Ordinary paragraph: every PARAGRAPH_PATTERN match becomes a single
        # space. Unlike a per-line split, no "all lines are short" special
        # case is applied here.
        grouped.append(re.sub(PARAGRAPH_PATTERN, " ", chunk))

    return "\n\n".join(grouped)
# str1 = "手工分段**绝缘装置10 工作斗在额定载荷下起升至最大平台高度制动后15 min, 工作斗下沉量应不超过该工况最大 平台高度的0.3%。"
# str2 = "手工分段**操控系统12 电气系统的要求如下:"
# custom_group_broken_paragraphs(str1)
# custom_group_broken_paragraphs(str2)