roll back the last commit
This commit is contained in:
parent
1d12f84310
commit
5a9c25d010
|
|
@ -2,4 +2,4 @@ from .mypdfloader import RapidOCRPDFLoader
|
||||||
from .myimgloader import RapidOCRLoader
|
from .myimgloader import RapidOCRLoader
|
||||||
from .customiedpdfloader import CustomizedPDFLoader
|
from .customiedpdfloader import CustomizedPDFLoader
|
||||||
from .mywordload import RapidWordLoader
|
from .mywordload import RapidWordLoader
|
||||||
from .customercore import custom_group_broken_paragraphs
|
#from .customercore import custom_group_broken_paragraphs
|
||||||
|
|
|
||||||
|
|
@ -5,12 +5,12 @@ from docx.document import Document as _Document
|
||||||
from docx.table import _Cell
|
from docx.table import _Cell
|
||||||
from docx.oxml.text.paragraph import CT_P
|
from docx.oxml.text.paragraph import CT_P
|
||||||
from docx.oxml.table import CT_Tbl
|
from docx.oxml.table import CT_Tbl
|
||||||
from docx.oxml.table import CT_TblGrid
|
#from docx.oxml.table import CT_TblGrid
|
||||||
from docx.table import _Cell, Table
|
from docx.table import _Cell, Table
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
from unstructured.partition.text import partition_text
|
from unstructured.partition.text import partition_text
|
||||||
import unstructured.cleaners.core
|
import unstructured.cleaners.core
|
||||||
from customercore import custom_group_broken_paragraphs
|
from .customercore import custom_group_broken_paragraphs
|
||||||
unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
|
unstructured.cleaners.core.group_broken_paragraphs = custom_group_broken_paragraphs
|
||||||
|
|
||||||
class RapidWordLoader(UnstructuredFileLoader):
|
class RapidWordLoader(UnstructuredFileLoader):
|
||||||
|
|
@ -33,10 +33,10 @@ class RapidWordLoader(UnstructuredFileLoader):
|
||||||
yield Paragraph(child, parent)
|
yield Paragraph(child, parent)
|
||||||
elif isinstance(child, CT_Tbl):
|
elif isinstance(child, CT_Tbl):
|
||||||
yield Table(child, parent)
|
yield Table(child, parent)
|
||||||
elif isinstance(child, CT_TblGrid):
|
# elif isinstance(child, CT_TblGrid):
|
||||||
yield Table(child, parent)
|
# yield Table(child, parent)
|
||||||
else:
|
# else:
|
||||||
print(f"都不属于")
|
# print(f"都不属于")
|
||||||
|
|
||||||
def read_table(table):
|
def read_table(table):
|
||||||
# 获取表格列标题
|
# 获取表格列标题
|
||||||
|
|
@ -66,7 +66,7 @@ class RapidWordLoader(UnstructuredFileLoader):
|
||||||
doc = docxDocument(filepath)
|
doc = docxDocument(filepath)
|
||||||
for block in iter_block_items(doc):
|
for block in iter_block_items(doc):
|
||||||
if isinstance(block,Paragraph):
|
if isinstance(block,Paragraph):
|
||||||
print(f"Paragraph:{block.text}")
|
#print(f"Paragraph:{block.text}")
|
||||||
resp += (block.text + "\n\n")
|
resp += (block.text + "\n\n")
|
||||||
elif isinstance(block, Table):
|
elif isinstance(block, Table):
|
||||||
resp += read_table(block) + "\n"
|
resp += read_table(block) + "\n"
|
||||||
|
|
|
||||||
|
|
@ -9,8 +9,7 @@ from configs import (LLM_MODELS, LLM_DEVICE, EMBEDDING_DEVICE,
|
||||||
FSCHAT_MODEL_WORKERS, HTTPX_DEFAULT_TIMEOUT)
|
FSCHAT_MODEL_WORKERS, HTTPX_DEFAULT_TIMEOUT)
|
||||||
import os
|
import os
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
#from langchain.chat_models import ChatOpenAI
|
from langchain.chat_models import ChatOpenAI
|
||||||
from langchain._api import ChatOpenAI
|
|
||||||
from langchain.llms import OpenAI, AzureOpenAI, Anthropic
|
from langchain.llms import OpenAI, AzureOpenAI, Anthropic
|
||||||
import httpx
|
import httpx
|
||||||
from typing import Literal, Optional, Callable, Generator, Dict, Any, Awaitable, Union, Tuple
|
from typing import Literal, Optional, Callable, Generator, Dict, Any, Awaitable, Union, Tuple
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue