customize word loader
commit 565a94c1bb (parent 9d8ee7717c)
@@ -17,6 +17,9 @@ RERANKER_MODEL = "bge-reranker-large"
 USE_RERANKER = False
 RERANKER_MAX_LENGTH = 1024
 
+# Whether to enable re-ranking of retrieved documents
+USE_RANKING = False
+
 # Set this when custom keywords need to be added to EMBEDDING_MODEL
 EMBEDDING_KEYWORD_FILE = "keywords.txt"
 EMBEDDING_MODEL_OUTPUT_PATH = "output"
@@ -1,3 +1,4 @@
 from .mypdfloader import RapidOCRPDFLoader
 from .myimgloader import RapidOCRLoader
 from .customiedpdfloader import CustomizedPDFLoader
+from .mywordload import RapidWordLoader
@@ -0,0 +1,77 @@
+from typing import List
+
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from docx import Document as docxDocument
+from docx.document import Document as _Document
+from docx.oxml.text.paragraph import CT_P
+from docx.oxml.table import CT_Tbl
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+
+
+class RapidWordLoader(UnstructuredFileLoader):
+    def _get_elements(self) -> List:
+        def iter_block_items(parent):
+            """
+            Yield each paragraph and table child within *parent*, in document order.
+            Each returned value is an instance of either Table or Paragraph.
+            """
+            # Resolve the underlying XML container for a Document or a table _Cell
+            if isinstance(parent, _Document):
+                parent_elm = parent._element.body
+            elif isinstance(parent, _Cell):
+                parent_elm = parent._element
+            else:
+                raise ValueError("iter_block_items expects a Document or a _Cell")
+
+            for child in parent_elm.iterchildren():
+                if isinstance(child, CT_P):
+                    yield Paragraph(child, parent)
+                elif isinstance(child, CT_Tbl):
+                    yield Table(child, parent)
+
+        def read_table(table):
+            # Use the first table row as the column headers
+            headers = [cell.text.strip() for cell in table.rows[0].cells]
+            # Accumulate the table contents as a string
+            table_string = ""
+
+            # Iterate over the data rows (start at the second row; the first row is the header)
+            for row_index, row in enumerate(table.rows[1:], 2):
+                row_data = []
+
+                # Iterate over the cells in the row
+                for cell_index, cell in enumerate(row.cells, 1):
+                    cell_text = cell.text.strip()
+                    row_data.append(f'"{headers[cell_index - 1]}": "{cell_text}"')
+
+                # Join the row's cells into a comma-separated string
+                row_string = ", ".join(row_data)
+                # Append the row string to the overall table string
+                table_string += f"{{{row_string}}}\n"
+
+            return table_string
+
+        def word2text(filepath):
+            resp = ""
+            try:
+                doc = docxDocument(filepath)
+                for block in iter_block_items(doc):
+                    if isinstance(block, Paragraph):
+                        resp += (block.text + "\n\n")
+                    elif isinstance(block, Table):
+                        resp += read_table(block) + "\n"
+            except ValueError:
+                print("word2text error: invalid parameter")
+            except Exception as e:
+                print(f"word2text error: {e}")
+            return resp
+
+        text = word2text(self.file_path)
+        from unstructured.partition.text import partition_text
+        return partition_text(text=text, **self.unstructured_kwargs)
+
+
+if __name__ == "__main__":
+    loader = RapidWordLoader(file_path="/Users/wangvivi/Desktop/MySelf/AI/Test/国家电网公司供电企业组织机构规范标准.docx")
+    docs = loader.load()
+    print(docs)
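
For reference, read_table() flattens every .docx table into one JSON-like line per data row, keyed by the header row. A minimal sketch of that formatting on plain Python lists (the headers and cell values are made-up examples, not taken from the commit):

import_free_demo = True  # no docx needed; mirrors read_table()'s string building
headers = ["部门", "职责"]                      # first table row, used as the keys
rows = [["调度中心", "电网运行监控"],
        ["营销部", "用电客户服务"]]             # remaining rows, one line each
table_string = ""
for row in rows:
    row_data = [f'"{h}": "{c}"' for h, c in zip(headers, row)]
    table_string += f"{{{', '.join(row_data)}}}\n"
print(table_string)
# {"部门": "调度中心", "职责": "电网运行监控"}
# {"部门": "营销部", "职责": "用电客户服务"}
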
@@ -80,3 +80,4 @@ watchdog>=3.0.0
 docx2txt
 elasticsearch
 PyPDF2
+jieba
@@ -17,6 +17,11 @@ from server.db.repository.knowledge_file_repository import get_file_detail
 from langchain.docstore.document import Document
 from server.knowledge_base.model.kb_document_model import DocumentWithVSId
 from typing import List, Dict
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from configs import USE_RANKING
+import jieba
+
 
 
 def search_docs(
@@ -38,7 +43,42 @@ def search_docs(
             print(f"search_docs, query:{query}")
             docs = kb.search_docs(query, top_k, score_threshold)
             print(f"search_docs, docs:{docs}")
 
+            if USE_RANKING:
+                queryList = []
+                queryList.append(query)
+                doc_contents = [doc[0].page_content for doc in docs]
+
+                # Tokenize with jieba so TfidfVectorizer sees space-separated Chinese terms
+                doc_contents = [" ".join(jieba.cut(doc)) for doc in doc_contents]
+                queryList = [" ".join(jieba.cut(doc)) for doc in queryList]
+
+                #print(f"****** search_docs, doc_contents:{doc_contents}")
+                #print(f"****** search_docs, queryList:{queryList}")
+
+                vectorizer = TfidfVectorizer()
+                tfidf_matrix = vectorizer.fit_transform(doc_contents)
+                print(f"****** search_docs, tfidf_matrix:{tfidf_matrix}")
+                query_vector = vectorizer.transform(queryList)
+                print(f"****** search_docs, query_vector:{query_vector}")
+                cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
+                print(f"****** search_docs, cosine_similarities:{cosine_similarities}")
+
+                # Attach the similarity scores to the documents and sort best-first
+                docs_with_scores = [(doc, score) for doc, score in zip(docs, cosine_similarities)]
+                sorted_docs = sorted(docs_with_scores, key=lambda x: x[1], reverse=True)
+                print(f"****** search_docs, sorted_docs:{sorted_docs}")
+                # Each entry is ((Document, vector_score), tfidf_score), so the Document sits at x[0][0]
+                data = [DocumentWithVSId(**x[0][0].dict(), score=x[1], id=x[0][0].metadata.get("id")) for x in sorted_docs]
+
+            else:
+                #data = [DocumentWithScore(**doc[0].dict(), score=score) for doc, score in sorted_docs]
+                #data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
                 data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs]
 
         elif file_name or metadata:
             print(f"search_docs, kb:{knowledge_base_name}, filename:{file_name}")
             data = kb.list_docs(file_name=file_name, metadata=metadata)
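
For reference, the USE_RANKING branch boils down to: tokenize the query and the retrieved passages with jieba, fit a TF-IDF vectorizer on the passages, and reorder them by cosine similarity to the query. A self-contained sketch of that pipeline (the query and passages are invented examples):

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

query = "供电企业组织机构"
passages = ["供电企业的组织机构设置规范", "变电站设备运行维护要求"]

# jieba inserts spaces between Chinese words so the vectorizer's default
# whitespace-based tokenizer can pick out individual terms.
tokenized_passages = [" ".join(jieba.cut(p)) for p in passages]
tokenized_query = [" ".join(jieba.cut(query))]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tokenized_passages)   # one row per passage
query_vector = vectorizer.transform(tokenized_query)          # same vocabulary
scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Best match first, mirroring sorted_docs in search_docs().
ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
print(ranked)
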
@@ -122,7 +122,8 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
                "UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
                "EverNoteLoader": ['.enex'],
                "UnstructuredFileLoader": ['.txt'],
-               "Docx2txtLoader": ['.docx', '.doc'],
+               "Docx2txtLoader": ['.doc'],
+               "RapidWordLoader": ['.docx'],
                }
 SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
@@ -162,7 +163,7 @@ def get_loader(loader_name: str, file_path: str, loader_kwargs: Dict = None):
     '''
     loader_kwargs = loader_kwargs or {}
     try:
-        if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader", "FilteredCSVLoader"]:
+        if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader", "FilteredCSVLoader", "RapidWordLoader"]:
             document_loaders_module = importlib.import_module('document_loaders')
         else:
             document_loaders_module = importlib.import_module('langchain.document_loaders')
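
Taken together, the LOADER_DICT and get_loader changes route '.docx' files to RapidWordLoader, imported from the project-local document_loaders package rather than langchain.document_loaders. A simplified sketch of that dispatch, assuming the resolved loader class takes the file path as its first argument:

import importlib

def resolve_loader(loader_name: str, file_path: str, **loader_kwargs):
    # Custom loaders (OCR, CSV filtering, the new Word loader) live in the
    # project-local document_loaders package; everything else comes from langchain.
    if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader",
                       "FilteredCSVLoader", "RapidWordLoader"]:
        module = importlib.import_module('document_loaders')
    else:
        module = importlib.import_module('langchain.document_loaders')
    DocumentLoader = getattr(module, loader_name)
    return DocumentLoader(file_path, **loader_kwargs)
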
@@ -64,9 +64,10 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
         text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # split on numbered headings such as "1 ..."
         text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text)  # replace the manual "手工分段**" split marker
         text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # split on "第 X 章" chapter headings
+        #text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # split on "表 A.2" style table captions
+
+        text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text)  # split on "表 A.2" style table captions
         text = re.sub(r'(\n+(?<!\.|[a-zA-Z0-9])[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\n\n\n\n\1", text)  # split on "1.2" style section numbers
-        text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\1", text)  # split on "表 A.4.a" style table captions
         text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\1", text)  # split on "第 X 条" article headings
         text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\1", text)  # split on "一、二、..." enumerated headings
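
These substitutions steer chunking by injecting long runs of newlines in front of headings such as "第 X 章", "1.2" or "表 A.2"; the splitter's newline-based separators then prefer to break at those points. A toy illustration of the marking step (the sample text is invented, and plain str.split stands in for the splitter's separator handling):

import re

text = "总则\n第一章 范围\n本标准规定了组织机构设置要求。\n第二章 职责\n调度中心负责电网运行监控。"
# Same trick as above: prepend a run of ten newlines to every "第 X 章" heading...
marked = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)
# ...so a splitter whose highest-priority separators are long newline runs
# breaks right before each chapter heading.
for chunk in marked.split("\n" * 10):
    print(repr(chunk.strip()))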