commit
7b7a180323
Binary file not shown.
Binary file not shown.
|
|
@ -1,2 +1,3 @@
|
|||
from .mypdfloader import RapidOCRPDFLoader
|
||||
from .myimgloader import RapidOCRLoader
|
||||
from .customiedpdfloader import CustomizedPDFLoader
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
import tqdm
|
||||
|
||||
|
||||
class CustomizedPDFLoader(UnstructuredFileLoader):
    """Load a PDF by extracting its embedded text with PyPDF2 and
    partitioning the result with unstructured's text partitioner.

    Unlike the OCR-based loaders in this package, this loader performs no
    OCR: pages that contain only images yield empty text.
    """

    def _get_elements(self) -> List:
        def pdf2text(filepath):
            """Return the concatenated text of every page in the PDF at *filepath*."""
            import PyPDF2  # local import keeps the dependency optional

            resp = ""
            # Open in binary mode and close deterministically (the original
            # opened the file and never closed it — a resource leak).
            with open(filepath, mode='rb') as mypdf:
                doc = PyPDF2.PdfReader(mypdf)
                page_count = len(doc.pages)
                print(f"文档页数:{page_count}")

                b_unit = tqdm.tqdm(
                    total=page_count,
                    desc="CustomizedPDFLoader context page index: 0")
                for i in range(page_count):
                    # Update the description and force an immediate redraw.
                    b_unit.set_description(
                        "CustomizedPDFLoader context page index: {}".format(i + 1))
                    b_unit.refresh()
                    page = doc.pages[i]
                    text = page.extract_text()
                    resp += text + "\n"
                    # Advance the bar (the original never called update()).
                    b_unit.update(1)
                b_unit.close()
            return resp

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)
|
||||
|
||||
|
||||
# Manual smoke test: load a sample PDF and print the resulting documents.
if __name__ == "__main__":
    loader = CustomizedPDFLoader(file_path="变电站设备验收规范第28 部分避雷针.pdf")
    docs = loader.load()
    print(docs)
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 146 KiB |
Binary file not shown.
|
|
@ -1,3 +1,7 @@
|
|||
# from .kb_api import list_kbs, create_kb, delete_kb
|
||||
# from .kb_doc_api import list_docs, upload_doc, delete_doc, update_doc, download_doc, recreate_vector_store
|
||||
# from .utils import KnowledgeFile, KBServiceFactory
|
||||
|
||||
from server.knowledge_base.kb_doc_api import *
|
||||
from server.knowledge_base.kb_api import *
|
||||
from server.knowledge_base.utils import *
|
||||
|
|
@ -11,8 +11,9 @@ from configs import (
|
|||
TEXT_SPLITTER_NAME,
|
||||
)
|
||||
import importlib
|
||||
from text_splitter import zh_title_enhance as func_zh_title_enhance
|
||||
from text_splitter import zh_second_title_enhance
|
||||
import langchain.document_loaders
|
||||
from langchain.document_loaders.word_document import Docx2txtLoader
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.text_splitter import TextSplitter
|
||||
from pathlib import Path
|
||||
|
|
@ -20,7 +21,7 @@ from server.utils import run_in_thread_pool, get_model_worker_config
|
|||
import json
|
||||
from typing import List, Union,Dict, Tuple, Generator
|
||||
import chardet
|
||||
|
||||
import re
|
||||
|
||||
def validate_kb_name(knowledge_base_id: str) -> bool:
|
||||
# 检查是否包含预期外的字符或路径攻击关键字
|
||||
|
|
@ -84,7 +85,7 @@ def list_files_from_folder(kb_name: str):
|
|||
|
||||
return result
|
||||
|
||||
|
||||
#PDFPlumberLoader
|
||||
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
||||
"MHTMLLoader": ['.mhtml'],
|
||||
"UnstructuredMarkdownLoader": ['.md'],
|
||||
|
|
@ -105,11 +106,12 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
|||
"SRTLoader": ['.srt'],
|
||||
"TomlLoader": ['.toml'],
|
||||
"UnstructuredTSVLoader": ['.tsv'],
|
||||
"UnstructuredWordDocumentLoader": ['.docx', '.doc'],
|
||||
#"UnstructuredWordDocumentLoader": ['.docx', '.doc'],
|
||||
"UnstructuredXMLLoader": ['.xml'],
|
||||
"UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
|
||||
"EverNoteLoader": ['.enex'],
|
||||
"UnstructuredFileLoader": ['.txt'],
|
||||
"Docx2txtLoader":['.docx','.doc'],
|
||||
}
|
||||
SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
|
||||
|
||||
|
|
@ -275,6 +277,11 @@ class KnowledgeFile:
|
|||
self.kb_name = knowledge_base_name
|
||||
self.filename = str(Path(filename).as_posix())
|
||||
self.ext = os.path.splitext(filename)[-1].lower()
|
||||
|
||||
#self.filename = filename
|
||||
#self.ext = os.path.splitext(filename)[-1].lower()
|
||||
self.doc_title_name, file_extension = os.path.splitext(filename)
|
||||
#self.ext = file_extension.lower()
|
||||
if self.ext not in SUPPORTED_EXTS:
|
||||
raise ValueError(f"暂未支持的文件格式 {self.filename}")
|
||||
self.loader_kwargs = loader_kwargs
|
||||
|
|
@ -283,6 +290,7 @@ class KnowledgeFile:
|
|||
self.splited_docs = None
|
||||
self.document_loader_name = get_LoaderClass(self.ext)
|
||||
self.text_splitter_name = TEXT_SPLITTER_NAME
|
||||
print(f"KnowledgeFile: filepath:{self.filepath}")
|
||||
|
||||
def file2docs(self, refresh: bool = False):
|
||||
if self.docs is None or refresh:
|
||||
|
|
@ -293,6 +301,8 @@ class KnowledgeFile:
|
|||
self.docs = loader.load()
|
||||
return self.docs
|
||||
|
||||
print(f"KnowledgeFile: filepath:{self.filepath}, doc_title_name:{self.doc_title_name}, ext:{self.ext}")
|
||||
|
||||
def docs2texts(
|
||||
self,
|
||||
docs: List[Document] = None,
|
||||
|
|
@ -302,7 +312,21 @@ class KnowledgeFile:
|
|||
chunk_overlap: int = OVERLAP_SIZE,
|
||||
text_splitter: TextSplitter = None,
|
||||
):
|
||||
def customize_zh_title_enhance(docs: Document) -> Document:
|
||||
if len(docs) > 0:
|
||||
for doc in docs:
|
||||
doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}"
|
||||
return docs
|
||||
else:
|
||||
print("文件不存在")
|
||||
|
||||
docs = docs or self.file2docs(refresh=refresh)
|
||||
#after loading, remove the redundant line break
|
||||
for doc in docs:
|
||||
if doc.page_content.strip()!="":
|
||||
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
|
||||
if not docs:
|
||||
return []
|
||||
if self.ext not in [".csv"]:
|
||||
|
|
@ -312,17 +336,39 @@ class KnowledgeFile:
|
|||
if self.text_splitter_name == "MarkdownHeaderTextSplitter":
|
||||
docs = text_splitter.split_text(docs[0].page_content)
|
||||
else:
|
||||
print(f"**********************docs2texts: text_splitter.split_documents(docs)")
|
||||
outputfile = file_name_without_extension + "_source.txt"
|
||||
with open(outputfile, 'w') as file:
|
||||
for doc in docs:
|
||||
file.write(doc.page_content)
|
||||
docs = text_splitter.split_documents(docs)
|
||||
|
||||
#print(f"文档切分示例:{docs[0]}")
|
||||
# print(f"KnowledgeFile: filepath:{self.filepath}")
|
||||
# file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||
# print("filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
|
||||
|
||||
if not docs:
|
||||
return []
|
||||
|
||||
print(f"文档切分示例:{docs[0]}")
|
||||
if zh_title_enhance:
|
||||
docs = func_zh_title_enhance(docs)
|
||||
docs = zh_second_title_enhance(docs)
|
||||
docs = customize_zh_title_enhance(docs)
|
||||
i = 1
|
||||
outputfile = file_name_without_extension + "_split.txt"
|
||||
# 打开文件以写入模式
|
||||
with open(outputfile, 'w') as file:
|
||||
for doc in docs:
|
||||
print(f"**********切分段{i}:{doc}")
|
||||
file.write(f"\n**********切分段{i}")
|
||||
file.write(doc.page_content)
|
||||
i = i+1
|
||||
|
||||
self.splited_docs = docs
|
||||
return self.splited_docs
|
||||
|
||||
|
||||
|
||||
def file2text(
|
||||
self,
|
||||
zh_title_enhance: bool = ZH_TITLE_ENHANCE,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
# Ad-hoc manual test for the FAISS knowledge-base service: indexes one
# .docx file into the "test" knowledge base. The commented lines below are
# alternate scenarios (loading/splitting a file, deleting a doc, dropping
# the KB, searching) kept for interactive experimentation.
from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
from server.knowledge_base import KnowledgeFile

if __name__ == '__main__':
    from pprint import pprint

    #kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
    # kb_file = KnowledgeFile(filename="国网安徽信通公司安全准入实施要求_修订.docx", knowledge_base_name="test")
    # docs = kb_file.file2docs()
    # pprint(docs[-1])
    # docs = kb_file.file2text()
    # pprint(docs[-1])

    # Index a single document into the "test" knowledge base.
    faissService = FaissKBService("test")
    faissService.add_doc(KnowledgeFile("电力电缆故障测寻车技术规范.docx", "test"))
    # faissService.delete_doc(KnowledgeFile("README.md", "test"))
    # faissService.do_drop_kb()
    #print(faissService.search_docs("准入手续的内容是什么?"))
|
||||
|
||||
|
||||
|
|
@ -2,3 +2,4 @@ from .chinese_text_splitter import ChineseTextSplitter
|
|||
from .ali_text_splitter import AliTextSplitter
|
||||
from .zh_title_enhance import zh_title_enhance
|
||||
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
||||
from .zh_second_title_enhance import zh_second_title_enhance
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,113 @@
|
|||
from langchain.docstore.document import Document
|
||||
import re
|
||||
|
||||
def get_fist_level_title(
        text: str,
) -> str:
    """Return the first line of *text* if it looks like a first-level
    heading such as "6 进出等电位", otherwise the empty string.

    A candidate is rejected when the text is empty, the first line is
    longer than 25 characters (headings are short), ends in punctuation
    (i.e. is a sentence), or does not match the "<number> <words>" pattern.
    """
    # Empty input can never be a title. The original used
    # `len(text) == 0 and len(text) >= 25` — always False — so this guard
    # was dead and "" crashed with IndexError on splitlines()[0].
    if len(text) == 0:
        print("Not a title. Text is empty or longer than 25.")
        return ""

    splitlines = text.splitlines()
    first_line = splitlines[0]

    # Per the original intent (comment said "or longer than 25"), overly
    # long lines are body text, not headings.
    if len(first_line) > 25:
        print("Not a title. Text is empty or longer than 25.")
        return ""

    # A line that ends with punctuation is a sentence, not a title.
    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
    if ENDS_IN_PUNCT_RE.search(first_line) is not None:
        return ""

    # "<digits><space><non-dot words>" with nothing glued in front of
    # the number (the lookbehind rejects e.g. "6.1" or "v6").
    FIRST_TITLE = r'((?<!\.|[a-zA-Z0-9]|\S)\d+[^\S\n]+[^\s\.]+\S+)'
    TITLE_PUNCT_RE = re.compile(FIRST_TITLE)
    if TITLE_PUNCT_RE.search(first_line) is not None:
        return first_line
    return ""
|
||||
|
||||
#return the 2nd level title
|
||||
def get_second_level_title(
        text: str,
) -> str:
    """Return the line of *text* (first line, or second as a fallback)
    that matches a second-level heading like "6.1 直线塔进出等电位",
    otherwise the empty string.
    """
    # Empty input can never be a title. The original used
    # `len(text) == 0 and len(text) >= 25` — always False — so "" crashed
    # with IndexError on splitlines()[0].
    if len(text) == 0:
        print("Not a title. Text is empty or longer than 25.")
        return ""

    splitlines = text.splitlines()
    first_line = splitlines[0]

    # A first line that ends with punctuation is a sentence, not a title.
    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
    if ENDS_IN_PUNCT_RE.search(first_line) is not None:
        return ""

    # "<n>.<m> <words>" with nothing glued before or after the numbering
    # (the lookbehind/lookahead reject "6.1.1" third-level numbering).
    Second_TITLE = r'((?<!\.|[a-zA-Z0-9]|\S)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    TITLE_PUNCT_RE = re.compile(Second_TITLE)
    if TITLE_PUNCT_RE.search(first_line) is not None:
        return first_line

    # Fallback: the heading may sit on the second line (e.g. the chunk
    # starts with a short label line).
    if len(splitlines) > 1:
        Second_line = splitlines[1]
        if TITLE_PUNCT_RE.search(Second_line) is not None:
            return Second_line
    return ""
|
||||
|
||||
#judge if it is 3rd level content
|
||||
def is_third_level_content(
        text: str,
) -> bool:
    """Return True if the first line of *text* carries third-level
    numbering such as "6.1.1 ..." followed by content, else False.
    """
    # Empty text cannot be third-level content.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    head = text.splitlines()[0]

    # "<n>.<m>.<k> <words>" with nothing glued before or after the
    # numbering (lookbehind/lookahead reject e.g. "6.1.1.2" or "a6.1.1").
    third_level_pattern = re.compile(
        r'((?<!\.|[a-zA-Z0-9]|\S)\s*[0-9]+\s*\.\s*[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )
    return third_level_pattern.search(head) is not None
|
||||
|
||||
#给三级被分开的内容 增加二级标题
|
||||
def zh_second_title_enhance(docs: Document) -> Document:
    """Prefix third-level content chunks with their most recent
    second-level title so that split chunks keep their section context.

    Walks *docs* in order, remembering the last second-level title seen.
    Any subsequent chunk recognised as third-level content gets that title
    prepended to its page_content. The remembered title is dropped as soon
    as a chunk is neither a new second-level title nor third-level content.
    Returns the (mutated) list of docs.
    """
    title = None
    if len(docs) > 0:
        for doc in docs:
            second_title = get_second_level_title(doc.page_content)
            if second_title:
                # New section starts: remember its title.
                title = second_title
            elif title:
                temp_third_content = is_third_level_content(doc.page_content)
                if temp_third_content:
                    doc.page_content = f"{title} {doc.page_content}"
                else:
                    # Left the section: stop carrying the title forward.
                    title = None
        return docs
    else:
        print("文件不存在")
        # Bug fix: previously this branch fell through and returned None,
        # breaking callers that reassign `docs = zh_second_title_enhance(docs)`.
        return docs
|
||||
|
||||
|
||||
# Manual smoke test for the title helpers using a realistic document chunk.
if __name__ == "__main__":
    # Renamed from `str`, which shadowed the builtin.
    sample_text = """6 进出等电位
6.1 直线塔进出等电位
6.1.1 对于直线塔, 作业人员不得从横担或绝缘子串垂直进出等电位, 可采用吊篮(吊椅、吊梯) 法、 绝缘软梯法等方式进出等电位。
6.1.2 等电位作业人员进出等电位时与接地体及带电体的各电气间隙距离(包括安全距离、组合间隙) 均应满足表 1 、3 要求。
6.1.3 吊篮(吊椅、吊梯)必须用吊拉绳索稳固悬吊; 吊篮(吊椅、吊梯)的移动速度必须用绝缘滑 车组严格控制, 做到均匀、慢速; 固定吊拉绳索的长度, 应准确计算或实际丈量, 保证等电位作业人员 即将进入等电位时人体最高部位不超过导线侧均压环。
6.2 耐张塔进出等电位
6.2.1 在耐张塔进出等电位时,作业人员可采用沿耐张绝缘子串方法或其它方法进出等电位。
6.2.2 等电位作业人员沿绝缘子串移动时, 手与脚的位置必须保持对应一致, 且人体和工具短接的绝 缘子片数应符合 5.2.2 的要求。
6.2.3 等电位作业人员所系安全带,应绑在手扶的绝缘子串上,并与等电位作业人员同步移动。
6.2.4 等电位作业人员在进出等电位时,应在移动至距离带电体 3 片绝缘子时进行电位转移,方可进 行后续操作。
6.2.5 带电作业人员与接地体及带电体的各电气间隙距离(包括安全距离、组合间隙)和经人体或工 具短接后的良好绝缘子片数均应满足表 4 要求,否则不能沿耐张绝缘子串进出等电位。
7 作业中的注意事项
7.1 等电位作业人员在带电作业过程中时,应避免身体动作幅度过大。
7.2 等电位作业人员与地电位作业人员之间传递物品应采用绝缘工具,绝缘工具的有效长度,应满足 表 2 的规定。
7.3 屏蔽服装应无破损和孔洞, 各部分应连接良好、可靠。发现破损和毛刺时应送有资质的试验单位 进行屏蔽服装电阻和屏蔽效率测量,测量结果满足本标准 5.3.1 条的要求后,方可使用。
7.4 绝缘工具在使用前, 应使用 2500V 及以上兆欧表进行分段检测(电极宽 2cm,极间宽 2cm),阻值 不低于 700MΩ。"""
    # Renamed from `title`: the first call returns a bool, not a title.
    result = is_third_level_content(sample_text)
    print(result)
    result = get_second_level_title(sample_text)
    print(result)
    #zh_second_title_enhance()
|
||||
Binary file not shown.
Binary file not shown.
5
webui.py
5
webui.py
|
|
@ -21,7 +21,7 @@ if __name__ == "__main__":
|
|||
menu_items={
|
||||
'Get Help': 'https://github.com/chatchat-space/Langchain-Chatchat',
|
||||
'Report a bug': "https://github.com/chatchat-space/Langchain-Chatchat/issues",
|
||||
'About': f"""欢迎使用 Langchain-Chatchat WebUI {VERSION}!"""
|
||||
'About': f"""欢迎使用 思极大模型 WebUI {VERSION}!"""
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -40,7 +40,8 @@ if __name__ == "__main__":
|
|||
st.image(
|
||||
os.path.join(
|
||||
"img",
|
||||
"logo-long-chatchat-trans-v2.png"
|
||||
"siji.jpg"
|
||||
#"logo-long-chatchat-trans-v2.png"
|
||||
),
|
||||
use_column_width=True
|
||||
)
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue