commit
7b7a180323
Binary file not shown.
Binary file not shown.
|
|
@ -1,2 +1,3 @@
|
|||
from .mypdfloader import RapidOCRPDFLoader
|
||||
from .myimgloader import RapidOCRLoader
|
||||
from .customiedpdfloader import CustomizedPDFLoader
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
from typing import List
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
import tqdm
|
||||
|
||||
|
||||
class CustomizedPDFLoader(UnstructuredFileLoader):
    """Load a PDF by extracting its embedded text with PyPDF2 and
    partitioning the result with unstructured's text partitioner.

    Unlike the OCR-based loaders in this package, this loader performs no
    OCR: pages that contain only images yield empty text.
    """

    def _get_elements(self) -> List:
        def pdf2text(filepath):
            """Return the concatenated text of every page in the PDF at *filepath*."""
            import PyPDF2  # local import keeps the dependency optional

            resp = ""
            # Open in binary mode and close deterministically (the original
            # opened the file and never closed it — a resource leak).
            with open(filepath, mode='rb') as mypdf:
                doc = PyPDF2.PdfReader(mypdf)
                page_count = len(doc.pages)
                print(f"文档页数:{page_count}")

                b_unit = tqdm.tqdm(
                    total=page_count,
                    desc="CustomizedPDFLoader context page index: 0")
                for i in range(page_count):
                    # Update the description and force an immediate redraw.
                    b_unit.set_description(
                        "CustomizedPDFLoader context page index: {}".format(i + 1))
                    b_unit.refresh()
                    page = doc.pages[i]
                    text = page.extract_text()
                    resp += text + "\n"
                    # Advance the bar (the original never called update()).
                    b_unit.update(1)
                b_unit.close()
            return resp

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(text=text, **self.unstructured_kwargs)
|
||||
|
||||
|
||||
# Manual smoke test: load a sample PDF and print the resulting documents.
if __name__ == "__main__":
    loader = CustomizedPDFLoader(file_path="变电站设备验收规范第28 部分避雷针.pdf")
    docs = loader.load()
    print(docs)
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 146 KiB |
Binary file not shown.
|
|
@ -1,3 +1,7 @@
|
|||
# from .kb_api import list_kbs, create_kb, delete_kb
|
||||
# from .kb_doc_api import list_docs, upload_doc, delete_doc, update_doc, download_doc, recreate_vector_store
|
||||
# from .utils import KnowledgeFile, KBServiceFactory
|
||||
|
||||
from server.knowledge_base.kb_doc_api import *
|
||||
from server.knowledge_base.kb_api import *
|
||||
from server.knowledge_base.utils import *
|
||||
|
|
@ -11,8 +11,9 @@ from configs import (
|
|||
TEXT_SPLITTER_NAME,
|
||||
)
|
||||
import importlib
|
||||
from text_splitter import zh_title_enhance as func_zh_title_enhance
|
||||
from text_splitter import zh_second_title_enhance
|
||||
import langchain.document_loaders
|
||||
from langchain.document_loaders.word_document import Docx2txtLoader
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.text_splitter import TextSplitter
|
||||
from pathlib import Path
|
||||
|
|
@ -20,7 +21,7 @@ from server.utils import run_in_thread_pool, get_model_worker_config
|
|||
import json
|
||||
from typing import List, Union,Dict, Tuple, Generator
|
||||
import chardet
|
||||
|
||||
import re
|
||||
|
||||
def validate_kb_name(knowledge_base_id: str) -> bool:
|
||||
# 检查是否包含预期外的字符或路径攻击关键字
|
||||
|
|
@ -84,7 +85,7 @@ def list_files_from_folder(kb_name: str):
|
|||
|
||||
return result
|
||||
|
||||
|
||||
#PDFPlumberLoader
|
||||
LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
||||
"MHTMLLoader": ['.mhtml'],
|
||||
"UnstructuredMarkdownLoader": ['.md'],
|
||||
|
|
@ -105,11 +106,12 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
|
|||
"SRTLoader": ['.srt'],
|
||||
"TomlLoader": ['.toml'],
|
||||
"UnstructuredTSVLoader": ['.tsv'],
|
||||
"UnstructuredWordDocumentLoader": ['.docx', '.doc'],
|
||||
#"UnstructuredWordDocumentLoader": ['.docx', '.doc'],
|
||||
"UnstructuredXMLLoader": ['.xml'],
|
||||
"UnstructuredPowerPointLoader": ['.ppt', '.pptx'],
|
||||
"EverNoteLoader": ['.enex'],
|
||||
"UnstructuredFileLoader": ['.txt'],
|
||||
"Docx2txtLoader":['.docx','.doc'],
|
||||
}
|
||||
SUPPORTED_EXTS = [ext for sublist in LOADER_DICT.values() for ext in sublist]
|
||||
|
||||
|
|
@ -275,6 +277,11 @@ class KnowledgeFile:
|
|||
self.kb_name = knowledge_base_name
|
||||
self.filename = str(Path(filename).as_posix())
|
||||
self.ext = os.path.splitext(filename)[-1].lower()
|
||||
|
||||
#self.filename = filename
|
||||
#self.ext = os.path.splitext(filename)[-1].lower()
|
||||
self.doc_title_name, file_extension = os.path.splitext(filename)
|
||||
#self.ext = file_extension.lower()
|
||||
if self.ext not in SUPPORTED_EXTS:
|
||||
raise ValueError(f"暂未支持的文件格式 {self.filename}")
|
||||
self.loader_kwargs = loader_kwargs
|
||||
|
|
@ -283,6 +290,7 @@ class KnowledgeFile:
|
|||
self.splited_docs = None
|
||||
self.document_loader_name = get_LoaderClass(self.ext)
|
||||
self.text_splitter_name = TEXT_SPLITTER_NAME
|
||||
print(f"KnowledgeFile: filepath:{self.filepath}")
|
||||
|
||||
def file2docs(self, refresh: bool = False):
|
||||
if self.docs is None or refresh:
|
||||
|
|
@ -293,6 +301,8 @@ class KnowledgeFile:
|
|||
self.docs = loader.load()
|
||||
return self.docs
|
||||
|
||||
print(f"KnowledgeFile: filepath:{self.filepath}, doc_title_name:{self.doc_title_name}, ext:{self.ext}")
|
||||
|
||||
def docs2texts(
|
||||
self,
|
||||
docs: List[Document] = None,
|
||||
|
|
@ -302,7 +312,21 @@ class KnowledgeFile:
|
|||
chunk_overlap: int = OVERLAP_SIZE,
|
||||
text_splitter: TextSplitter = None,
|
||||
):
|
||||
def customize_zh_title_enhance(docs: Document) -> Document:
|
||||
if len(docs) > 0:
|
||||
for doc in docs:
|
||||
doc.page_content = f"下文与({self.doc_title_name})有关。{doc.page_content}"
|
||||
return docs
|
||||
else:
|
||||
print("文件不存在")
|
||||
|
||||
docs = docs or self.file2docs(refresh=refresh)
|
||||
#after loading, remove the redundant line break
|
||||
for doc in docs:
|
||||
if doc.page_content.strip()!="":
|
||||
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
|
||||
if not docs:
|
||||
return []
|
||||
if self.ext not in [".csv"]:
|
||||
|
|
@ -312,17 +336,39 @@ class KnowledgeFile:
|
|||
if self.text_splitter_name == "MarkdownHeaderTextSplitter":
|
||||
docs = text_splitter.split_text(docs[0].page_content)
|
||||
else:
|
||||
print(f"**********************docs2texts: text_splitter.split_documents(docs)")
|
||||
outputfile = file_name_without_extension + "_source.txt"
|
||||
with open(outputfile, 'w') as file:
|
||||
for doc in docs:
|
||||
file.write(doc.page_content)
|
||||
docs = text_splitter.split_documents(docs)
|
||||
|
||||
#print(f"文档切分示例:{docs[0]}")
|
||||
# print(f"KnowledgeFile: filepath:{self.filepath}")
|
||||
# file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||
# print("filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
|
||||
|
||||
if not docs:
|
||||
return []
|
||||
|
||||
print(f"文档切分示例:{docs[0]}")
|
||||
if zh_title_enhance:
|
||||
docs = func_zh_title_enhance(docs)
|
||||
docs = zh_second_title_enhance(docs)
|
||||
docs = customize_zh_title_enhance(docs)
|
||||
i = 1
|
||||
outputfile = file_name_without_extension + "_split.txt"
|
||||
# 打开文件以写入模式
|
||||
with open(outputfile, 'w') as file:
|
||||
for doc in docs:
|
||||
print(f"**********切分段{i}:{doc}")
|
||||
file.write(f"\n**********切分段{i}")
|
||||
file.write(doc.page_content)
|
||||
i = i+1
|
||||
|
||||
self.splited_docs = docs
|
||||
return self.splited_docs
|
||||
|
||||
|
||||
|
||||
def file2text(
|
||||
self,
|
||||
zh_title_enhance: bool = ZH_TITLE_ENHANCE,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
# Ad-hoc manual test for the FAISS knowledge-base service: indexes one
# .docx file into the "test" knowledge base. The commented lines below are
# alternate scenarios (loading/splitting a file, deleting a doc, dropping
# the KB, searching) kept for interactive experimentation.
from server.knowledge_base.kb_service.faiss_kb_service import FaissKBService
from server.knowledge_base import KnowledgeFile

if __name__ == '__main__':
    from pprint import pprint

    #kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
    # kb_file = KnowledgeFile(filename="国网安徽信通公司安全准入实施要求_修订.docx", knowledge_base_name="test")
    # docs = kb_file.file2docs()
    # pprint(docs[-1])
    # docs = kb_file.file2text()
    # pprint(docs[-1])

    # Index a single document into the "test" knowledge base.
    faissService = FaissKBService("test")
    faissService.add_doc(KnowledgeFile("电力电缆故障测寻车技术规范.docx", "test"))
    # faissService.delete_doc(KnowledgeFile("README.md", "test"))
    # faissService.do_drop_kb()
    #print(faissService.search_docs("准入手续的内容是什么?"))
|
||||
|
||||
|
||||
|
|
@ -2,3 +2,4 @@ from .chinese_text_splitter import ChineseTextSplitter
|
|||
from .ali_text_splitter import AliTextSplitter
|
||||
from .zh_title_enhance import zh_title_enhance
|
||||
from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter
|
||||
from .zh_second_title_enhance import zh_second_title_enhance
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,113 @@
|
|||
from langchain.docstore.document import Document
|
||||
import re
|
||||
|
||||
def get_fist_level_title(
        text: str,
) -> str:
    """Return the first line of *text* if it looks like a first-level
    heading such as "6 进出等电位", otherwise the empty string.

    A candidate is rejected when the text is empty, the first line is
    longer than 25 characters (headings are short), ends in punctuation
    (i.e. is a sentence), or does not match the "<number> <words>" pattern.
    """
    # Empty input can never be a title. The original used
    # `len(text) == 0 and len(text) >= 25` — always False — so this guard
    # was dead and "" crashed with IndexError on splitlines()[0].
    if len(text) == 0:
        print("Not a title. Text is empty or longer than 25.")
        return ""

    splitlines = text.splitlines()
    first_line = splitlines[0]

    # Per the original intent (comment said "or longer than 25"), overly
    # long lines are body text, not headings.
    if len(first_line) > 25:
        print("Not a title. Text is empty or longer than 25.")
        return ""

    # A line that ends with punctuation is a sentence, not a title.
    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
    if ENDS_IN_PUNCT_RE.search(first_line) is not None:
        return ""

    # "<digits><space><non-dot words>" with nothing glued in front of
    # the number (the lookbehind rejects e.g. "6.1" or "v6").
    FIRST_TITLE = r'((?<!\.|[a-zA-Z0-9]|\S)\d+[^\S\n]+[^\s\.]+\S+)'
    TITLE_PUNCT_RE = re.compile(FIRST_TITLE)
    if TITLE_PUNCT_RE.search(first_line) is not None:
        return first_line
    return ""
|
||||
|
||||
#return the 2nd level title
|
||||
def get_second_level_title(
        text: str,
) -> str:
    """Return the line of *text* (first line, or second as a fallback)
    that matches a second-level heading like "6.1 直线塔进出等电位",
    otherwise the empty string.
    """
    # Empty input can never be a title. The original used
    # `len(text) == 0 and len(text) >= 25` — always False — so "" crashed
    # with IndexError on splitlines()[0].
    if len(text) == 0:
        print("Not a title. Text is empty or longer than 25.")
        return ""

    splitlines = text.splitlines()
    first_line = splitlines[0]

    # A first line that ends with punctuation is a sentence, not a title.
    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
    if ENDS_IN_PUNCT_RE.search(first_line) is not None:
        return ""

    # "<n>.<m> <words>" with nothing glued before or after the numbering
    # (the lookbehind/lookahead reject "6.1.1" third-level numbering).
    Second_TITLE = r'((?<!\.|[a-zA-Z0-9]|\S)[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    TITLE_PUNCT_RE = re.compile(Second_TITLE)
    if TITLE_PUNCT_RE.search(first_line) is not None:
        return first_line

    # Fallback: the heading may sit on the second line (e.g. the chunk
    # starts with a short label line).
    if len(splitlines) > 1:
        Second_line = splitlines[1]
        if TITLE_PUNCT_RE.search(Second_line) is not None:
            return Second_line
    return ""
|
||||
|
||||
#judge if it is 3rd level content
|
||||
def is_third_level_content(
        text: str,
) -> bool:
    """Return True if the first line of *text* carries third-level
    numbering such as "6.1.1 ..." followed by content, else False.
    """
    # Empty text cannot be third-level content.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    head = text.splitlines()[0]

    # "<n>.<m>.<k> <words>" with nothing glued before or after the
    # numbering (lookbehind/lookahead reject e.g. "6.1.1.2" or "a6.1.1").
    third_level_pattern = re.compile(
        r'((?<!\.|[a-zA-Z0-9]|\S)\s*[0-9]+\s*\.\s*[0-9]+\s*\.\s*[0-9]+[^\S\n]+[^\s\.]+(?!\.|[a-zA-Z0-9]))'
    )
    return third_level_pattern.search(head) is not None
|
||||
|
||||
#给三级被分开的内容 增加二级标题
|
||||
def zh_second_title_enhance(docs: Document) -> Document:
    """Prefix third-level content chunks with their most recent
    second-level title so that split chunks keep their section context.

    Walks *docs* in order, remembering the last second-level title seen.
    Any subsequent chunk recognised as third-level content gets that title
    prepended to its page_content. The remembered title is dropped as soon
    as a chunk is neither a new second-level title nor third-level content.
    Returns the (mutated) list of docs.
    """
    title = None
    if len(docs) > 0:
        for doc in docs:
            second_title = get_second_level_title(doc.page_content)
            if second_title:
                # New section starts: remember its title.
                title = second_title
            elif title:
                temp_third_content = is_third_level_content(doc.page_content)
                if temp_third_content:
                    doc.page_content = f"{title} {doc.page_content}"
                else:
                    # Left the section: stop carrying the title forward.
                    title = None
        return docs
    else:
        print("文件不存在")
        # Bug fix: previously this branch fell through and returned None,
        # breaking callers that reassign `docs = zh_second_title_enhance(docs)`.
        return docs
|
||||
|
||||
|
||||
# Manual smoke test for the title helpers using a realistic document chunk.
if __name__ == "__main__":
    # Renamed from `str`, which shadowed the builtin.
    sample_text = """6 进出等电位
6.1 直线塔进出等电位
6.1.1 对于直线塔, 作业人员不得从横担或绝缘子串垂直进出等电位, 可采用吊篮(吊椅、吊梯) 法、 绝缘软梯法等方式进出等电位。
6.1.2 等电位作业人员进出等电位时与接地体及带电体的各电气间隙距离(包括安全距离、组合间隙) 均应满足表 1 、3 要求。
6.1.3 吊篮(吊椅、吊梯)必须用吊拉绳索稳固悬吊; 吊篮(吊椅、吊梯)的移动速度必须用绝缘滑 车组严格控制, 做到均匀、慢速; 固定吊拉绳索的长度, 应准确计算或实际丈量, 保证等电位作业人员 即将进入等电位时人体最高部位不超过导线侧均压环。
6.2 耐张塔进出等电位
6.2.1 在耐张塔进出等电位时,作业人员可采用沿耐张绝缘子串方法或其它方法进出等电位。
6.2.2 等电位作业人员沿绝缘子串移动时, 手与脚的位置必须保持对应一致, 且人体和工具短接的绝 缘子片数应符合 5.2.2 的要求。
6.2.3 等电位作业人员所系安全带,应绑在手扶的绝缘子串上,并与等电位作业人员同步移动。
6.2.4 等电位作业人员在进出等电位时,应在移动至距离带电体 3 片绝缘子时进行电位转移,方可进 行后续操作。
6.2.5 带电作业人员与接地体及带电体的各电气间隙距离(包括安全距离、组合间隙)和经人体或工 具短接后的良好绝缘子片数均应满足表 4 要求,否则不能沿耐张绝缘子串进出等电位。
7 作业中的注意事项
7.1 等电位作业人员在带电作业过程中时,应避免身体动作幅度过大。
7.2 等电位作业人员与地电位作业人员之间传递物品应采用绝缘工具,绝缘工具的有效长度,应满足 表 2 的规定。
7.3 屏蔽服装应无破损和孔洞, 各部分应连接良好、可靠。发现破损和毛刺时应送有资质的试验单位 进行屏蔽服装电阻和屏蔽效率测量,测量结果满足本标准 5.3.1 条的要求后,方可使用。
7.4 绝缘工具在使用前, 应使用 2500V 及以上兆欧表进行分段检测(电极宽 2cm,极间宽 2cm),阻值 不低于 700MΩ。"""
    # Renamed from `title`: the first call returns a bool, not a title.
    result = is_third_level_content(sample_text)
    print(result)
    result = get_second_level_title(sample_text)
    print(result)
    #zh_second_title_enhance()
|
||||
Binary file not shown.
Binary file not shown.
5
webui.py
5
webui.py
|
|
@ -21,7 +21,7 @@ if __name__ == "__main__":
|
|||
menu_items={
|
||||
'Get Help': 'https://github.com/chatchat-space/Langchain-Chatchat',
|
||||
'Report a bug': "https://github.com/chatchat-space/Langchain-Chatchat/issues",
|
||||
'About': f"""欢迎使用 Langchain-Chatchat WebUI {VERSION}!"""
|
||||
'About': f"""欢迎使用 思极大模型 WebUI {VERSION}!"""
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -40,7 +40,8 @@ if __name__ == "__main__":
|
|||
st.image(
|
||||
os.path.join(
|
||||
"img",
|
||||
"logo-long-chatchat-trans-v2.png"
|
||||
"siji.jpg"
|
||||
#"logo-long-chatchat-trans-v2.png"
|
||||
),
|
||||
use_column_width=True
|
||||
)
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue