def custom_group_broken_paragraphs(
    text: str,
    line_split: re.Pattern = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Re-join paragraphs whose lines were wrapped purely for layout.

    The text is cut into chunks with ``paragraph_split``. Chunks that are
    empty after stripping are dropped. A chunk whose stripped form starts
    with a bullet marker is handed to ``group_bullet_paragraph`` and every
    item it returns is kept as its own paragraph. In any other chunk each
    match of ``PARAGRAPH_PATTERN`` is replaced by a single space. The
    resulting paragraphs are joined with a blank line (``"\\n\\n"``).

    Example (assuming the default patterns match a single and a double
    line break respectively -- confirm against ``unstructured``)::

        The big red fox
        is walking down the lane.

        At the end of the lane
        the fox met a bear.

    becomes::

        The big red fox is walking down the lane.

        At the end of the lane the fox met a bear.

    Args:
        text: Raw text to regroup.
        line_split: Accepted for signature compatibility only; the
            per-line split that used it is disabled, so it is never read.
        paragraph_split: Compiled pattern used to cut ``text`` into
            paragraph chunks.

    Returns:
        The regrouped paragraphs joined by ``"\\n\\n"``.
    """
    grouped = []
    for chunk in paragraph_split.split(text):
        stripped = chunk.strip()
        if not stripped:
            # Whitespace-only pieces produced by the split carry no content.
            continue

        # pytesseract converts some bullet points to standalone "e"
        # characters, so both bullet patterns are checked.
        starts_with_bullet = (
            UNICODE_BULLETS_RE.match(stripped) or E_BULLET_PATTERN.match(stripped)
        )
        if not starts_with_bullet:
            # Ordinary paragraph: collapse every in-paragraph line break.
            grouped.append(re.sub(PARAGRAPH_PATTERN, " ", chunk))
            continue

        # Bullet paragraph: each grouped bullet becomes its own paragraph.
        grouped.extend(group_bullet_paragraph(chunk))

    return "\n\n".join(grouped)
unstructured.partition.text import partition_text - return partition_text(text=text, paragraph_grouper = False, **self.unstructured_kwargs) + listText = partition_text(text=text, **self.unstructured_kwargs) + return listText if __name__ == "__main__": loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/Work/思极GPT/数字化部/设备类all/sb389/10kV带电作业用绝缘斗臂车.docx") diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index 66ab041..0a9f232 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -91,7 +91,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): _good_splits = [] _separator = "" if self._keep_separator else separator for s in splits: - print(f"***s:{s},len:{self._length_function(s)}") + #print(f"***s:{s},len:{self._length_function(s)}") if self._length_function(s) < self._chunk_size: _good_splits.append(s) #print(f"***_good_splits.append(s):{s}")