diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 582265c..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/document_loaders/__init__.py b/document_loaders/__init__.py
index a4d6b28..22340ae 100644
--- a/document_loaders/__init__.py
+++ b/document_loaders/__init__.py
@@ -1,2 +1,3 @@
 from .mypdfloader import RapidOCRPDFLoader
-from .myimgloader import RapidOCRLoader
\ No newline at end of file
+from .myimgloader import RapidOCRLoader
+from .customiedpdfloader import CustomizedPDFLoader
\ No newline at end of file
diff --git a/document_loaders/customiedpdfloader.py b/document_loaders/customiedpdfloader.py
new file mode 100644
--- /dev/null
+++ b/document_loaders/customiedpdfloader.py
@@ -0,0 +1,46 @@
+from typing import List
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+import tqdm
+
+
+class CustomizedPDFLoader(UnstructuredFileLoader):
+    """Load a PDF by reading its embedded text with PyPDF2."""
+
+    def _get_elements(self) -> List:
+        """Extract the PDF text and partition it into unstructured elements."""
+
+        def pdf2text(filepath):
+            """Return the text of all pages, each page followed by a newline."""
+            import PyPDF2
+
+            # Read inside a `with` block so the file handle is always closed.
+            with open(filepath, mode='rb') as mypdf:
+                doc = PyPDF2.PdfReader(mypdf)
+                page_count = len(doc.pages)
+                print(f"文档页数:{page_count}")
+
+                page_texts = []
+                b_unit = tqdm.tqdm(total=page_count, desc="CustomizedPDFLoader context page index: 0")
+                try:
+                    for i in range(page_count):
+                        # Show the 1-based number of the page being read.
+                        b_unit.set_description("CustomizedPDFLoader context page index: {}".format(i + 1))
+                        b_unit.refresh()
+                        # A page may yield no text (e.g. image-only pages).
+                        page_texts.append((doc.pages[i].extract_text() or "") + "\n")
+                        # Advance the bar so it reflects real progress.
+                        b_unit.update(1)
+                finally:
+                    b_unit.close()
+
+            return "".join(page_texts)
+
+        text = pdf2text(self.file_path)
+        from unstructured.partition.text import partition_text
+        return partition_text(text=text, **self.unstructured_kwargs)
+
+
+if __name__ == "__main__":
+    loader = CustomizedPDFLoader(file_path="变电站设备验收规范第28 部分避雷针.pdf")
+    docs = loader.load()
+    print(docs)
diff --git a/document_loaders/mypdfloader.py b/document_loaders/mypdfloader.py
--- a/document_loaders/mypdfloader.py
+++ b/document_loaders/mypdfloader.py
@@ -1,7 +1,7 @@
 from typing import List
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 import tqdm
-
+import os
 
 class RapidOCRPDFLoader(UnstructuredFileLoader):
     def _get_elements(self) -> List:
@@ -13,29 +13,40 @@ class RapidOCRPDFLoader(UnstructuredFileLoader):
             doc = fitz.open(filepath)
             resp = ""
 
+            file_name_without_extension = os.path.splitext(filepath)[0]
+
             b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
-            for i, page in enumerate(doc):
+            outputfile = file_name_without_extension + "_scan.txt"
+            # Open the output file for writing; UTF-8 because the markers below are non-ASCII
+            with open(outputfile, 'w', encoding='utf-8') as file:
+
+                for i, page in enumerate(doc):
 
-                # 更新描述
-                b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
-                # 立即显示进度条更新结果
-                b_unit.refresh()
-                # TODO: 依据文本与图片顺序调整处理方式
-                text = page.get_text("")
-                resp += text + "\n"
+                    # Update the progress-bar description
+                    b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
+                    # Redraw the progress bar immediately
+                    b_unit.refresh()
+                    # TODO: process text and images in the order they appear on the page
+                    text = page.get_text("")
+                    file.write(f"\n**********文字,页码:{i}")
+                    file.write(text)
+                    resp += text + "\n"
 
-                img_list = page.get_images()
-                for img in img_list:
-                    pix = fitz.Pixmap(doc, img[0])
-                    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
-                    result, _ = ocr(img_array)
-                    if result:
-                        ocr_result = [line[1] for line in result]
-                        resp += "\n".join(ocr_result)
 
-                # 更新进度
-                b_unit.update(1)
-            return resp
+                    img_list = page.get_images()
+                    for img in img_list:
+                        pix = fitz.Pixmap(doc, img[0])
+                        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
+                        result, _ = ocr(img_array)
+                        if result:
+                            ocr_result = [line[1] for line in result]
+                            file.write(f"\n*****图片****,页码:{i}")
+                            file.write("\n".join(ocr_result))  # write() only accepts str, not a list
+                            resp += "\n".join(ocr_result)
+
+                    # Advance the progress bar
+                    b_unit.update(1)
+                return resp
 
         text = pdf2text(self.file_path)
         from unstructured.partition.text import partition_text
diff --git a/server/.DS_Store b/server/.DS_Store
deleted file mode 100644
index 5a32a3e..0000000
Binary files a/server/.DS_Store and /dev/null differ
diff --git a/server/knowledge_base/.DS_Store b/server/knowledge_base/.DS_Store
deleted file mode 100644
index 4030beb..0000000
Binary files a/server/knowledge_base/.DS_Store and /dev/null differ
diff --git a/server/knowledge_base/kb_service/.DS_Store b/server/knowledge_base/kb_service/.DS_Store
deleted file mode 100644
index f5068e3..0000000
Binary files a/server/knowledge_base/kb_service/.DS_Store and /dev/null differ
diff --git a/server/knowledge_base/kb_service/knowledge_base/.DS_Store b/server/knowledge_base/kb_service/knowledge_base/.DS_Store
deleted file mode 100644
index 32904a3..0000000
Binary files a/server/knowledge_base/kb_service/knowledge_base/.DS_Store and /dev/null differ
diff --git a/server/knowledge_base/kb_service/knowledge_base/test/.DS_Store b/server/knowledge_base/kb_service/knowledge_base/test/.DS_Store
deleted file mode 100644
index e0859a5..0000000
Binary files a/server/knowledge_base/kb_service/knowledge_base/test/.DS_Store and /dev/null
differ diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index f25a706..e91587b 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -73,7 +73,7 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'], "UnstructuredMarkdownLoader": ['.md'], "CustomJSONLoader": [".json"], "CSVLoader": [".csv"], - "RapidOCRPDFLoader": [".pdf"], + "CustomizedPDFLoader": [".pdf"], "RapidOCRLoader": ['.png', '.jpg', '.jpeg', '.bmp'], "UnstructuredFileLoader": ['.eml', '.msg', '.rst', '.rtf', '.txt', '.xml', @@ -152,7 +152,7 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri 根据loader_name和文件路径或内容返回文档加载器。 ''' try: - if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader"]: + if loader_name in ["RapidOCRPDFLoader", "RapidOCRLoader", "CustomizedPDFLoader"]: document_loaders_module = importlib.import_module('document_loaders') else: document_loaders_module = importlib.import_module('langchain.document_loaders') diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index 480d6ca..b163504 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -37,6 +37,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): """Create a new TextSplitter.""" super().__init__(keep_separator=keep_separator, **kwargs) self._separators = separators or [ + SPLIT_SEPARATOE, SPLIT_SEPARATOE, #"\n\n", #"\n", @@ -54,7 +55,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): # Get appropriate separator to use separator = separators[-1] new_separators = [SPLIT_SEPARATOE] - text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2这样的章和节来分块 + text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过1.2 这样的章和节来分块 text = 
re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.4.a text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条 text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 条 @@ -88,7 +89,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): if not new_separators: final_chunks.append(s) else: - text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块 + s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s) # 再通过 1.2.3 来分块 other_info = self._split_text(s, new_separators) final_chunks.extend(other_info) if _good_splits: diff --git a/webui_pages/.DS_Store b/webui_pages/.DS_Store deleted file mode 100644 index 2ff7808..0000000 Binary files a/webui_pages/.DS_Store and /dev/null differ