commit 6e953da07b
Merge branch 'dev' into master
@@ -17,6 +17,7 @@ import models.shared as shared
 from agent import bing_search
 from langchain.docstore.document import Document
 from functools import lru_cache
+from textsplitter.zh_title_enhance import zh_title_enhance
 
 
 # patch HuggingFaceEmbeddings to make it hashable
@@ -56,7 +57,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
     return ret_list, [os.path.basename(p) for p in ret_list]
 
 
-def load_file(filepath, sentence_size=SENTENCE_SIZE):
+def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE):
     if filepath.lower().endswith(".md"):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         docs = loader.load()
@@ -79,6 +80,8 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
         docs = loader.load_and_split(text_splitter=textsplitter)
+    if using_zh_title_enhance:
+        docs = zh_title_enhance(docs)
     write_check_file(filepath, docs)
     return docs
 
@@ -173,4 +173,9 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
 
 # Also, if this runs on a server and raises "Failed to establish a new connection: [Errno 110] Connection timed out",
 # the server is behind a firewall; ask the administrator to whitelist the endpoint. If it is a company server, don't count on it.
 BING_SUBSCRIPTION_KEY = ""
+
+# Whether to enable Chinese title enhancement, plus its related configuration.
+# A title check marks which text chunks are titles and tags them in the metadata;
+# each chunk is then concatenated with the nearest preceding title to enrich the text.
+ZH_TITLE_ENHANCE = False
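The new flag ties into the load_file() change earlier in this diff: the parameter using_zh_title_enhance defaults to ZH_TITLE_ENHANCE, so setting the config value to True enables the feature globally, while a single call can still override it. A minimal sketch (hedged: only names introduced in this diff are used; the sample path is relative to the repository root):

    from chains.local_doc_qa import load_file

    # Per-call override of the config default (ZH_TITLE_ENHANCE = False):
    docs = load_file("knowledge_base/samples/content/test.txt",
                     using_zh_title_enhance=True)

    # Title chunks are tagged in metadata; body chunks get the nearest title prepended.
    for doc in docs:
        if doc.metadata.get("category") == "cn_Title":
            print("title:", doc.page_content)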
@@ -33,5 +33,4 @@ numpy~=1.23.5
 tqdm~=4.65.0
 requests~=2.28.2
 tenacity~=8.2.2
-# The charset_normalizer version installed by default is too new and raises `partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)`
 charset_normalizer==2.1.0
@@ -0,0 +1,21 @@
+from configs.model_config import *
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import nltk
+from vectorstores import MyFAISS
+from chains.local_doc_qa import load_file
+
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+if __name__ == "__main__":
+    filepath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                            "knowledge_base", "samples", "content", "test.txt")
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
+                                       model_kwargs={'device': EMBEDDING_DEVICE})
+
+    docs = load_file(filepath, using_zh_title_enhance=True)
+    vector_store = MyFAISS.from_documents(docs, embeddings)
+    query = "指令提示技术有什么示例"
+    search_result = vector_store.similarity_search(query)
+    print(search_result)
+    pass
@@ -1,2 +1,3 @@
 from .chinese_text_splitter import ChineseTextSplitter
 from .ali_text_splitter import AliTextSplitter
+from .zh_title_enhance import zh_title_enhance
@@ -0,0 +1,99 @@
+from langchain.docstore.document import Document
+import re
+
+
+def under_non_alpha_ratio(text: str, threshold: float = 0.5):
+    """Checks if the proportion of non-alpha characters in the text snippet exceeds a given
+    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
+    as a title or narrative text. The ratio does not count spaces.
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the proportion of non-alpha characters exceeds this threshold, the function
+        returns False
+    """
+    if len(text) == 0:
+        return False
+
+    alpha_count = len([char for char in text if char.strip() and char.isalpha()])
+    total_count = len([char for char in text if char.strip()])
+    try:
+        ratio = alpha_count / total_count
+        return ratio < threshold
+    except:
+        return False
+
+
+def is_possible_title(
+        text: str,
+        title_max_word_length: int = 20,
+        non_alpha_threshold: float = 0.5,
+) -> bool:
+    """Checks to see if the text passes all of the checks for a valid title.
+
+    Parameters
+    ----------
+    text
+        The input text to check
+    title_max_word_length
+        The maximum number of words a title can contain
+    non_alpha_threshold
+        The minimum number of alpha characters the text needs to be considered a title
+    """
+
+    # If the text length is 0, it is definitely not a title
+    if len(text) == 0:
+        print("Not a title. Text is empty.")
+        return False
+
+    # If the text ends with punctuation, it is not a title
+    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
+    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
+    if ENDS_IN_PUNCT_RE.search(text) is not None:
+        return False
+
+    # The text must not exceed the configured length, 20 by default
+    # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
+    # is less expensive and actual tokenization doesn't add much value for the length check
+    if len(text) > title_max_word_length:
+        return False
+
+    # The proportion of digits in the text must not be too high, otherwise it is not a title
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
+    # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
+    if text.endswith((",", ".", ",", "。")):
+        return False
+
+    if text.isnumeric():
+        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
+        return False
+
+    # The opening characters should contain a digit, within the first 5 by default
+    if len(text) < 5:
+        text_5 = text
+    else:
+        text_5 = text[:5]
+    alpha_in_text_5 = sum(list(map(lambda x: x.isnumeric(), list(text_5))))
+    if not alpha_in_text_5:
+        return False
+
+    return True
+
+
+def zh_title_enhance(docs: Document) -> Document:
+    title = None
+    if len(docs) > 0:
+        for doc in docs:
+            if is_possible_title(doc.page_content):
+                doc.metadata['category'] = 'cn_Title'
+                title = doc.page_content
+            elif title:
+                doc.page_content = f"下文与({title})有关。{doc.page_content}"
+        return docs
+    else:
+        print("文件不存在")
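A quick illustration of what the new helper does (a sketch: the two sample strings below are invented; the import path comes from the textsplitter/__init__.py change above, and the expected effects follow from the code in this hunk):

    from langchain.docstore.document import Document
    from textsplitter import zh_title_enhance

    docs = [
        Document(page_content="1.2 指令提示技术", metadata={}),               # short, starts with a digit -> treated as a title
        Document(page_content="下面给出几个提示词的示例,供参考。", metadata={}),  # ends with punctuation -> body text
    ]
    docs = zh_title_enhance(docs)
    print(docs[0].metadata)       # -> {'category': 'cn_Title'}
    print(docs[1].page_content)   # -> prefixed with "下文与(1.2 指令提示技术)有关。"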
@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
 import numpy as np
 import copy
 import os
+from configs.model_config import *
 
 
 class MyFAISS(FAISS, VectorStore):
@@ -23,6 +24,9 @@ class MyFAISS(FAISS, VectorStore):
                          docstore=docstore,
                          index_to_docstore_id=index_to_docstore_id,
                          normalize_L2=normalize_L2)
+        self.score_threshold = VECTOR_SEARCH_SCORE_THRESHOLD
+        self.chunk_size = CHUNK_SIZE
+        self.chunk_conent = False
 
     def seperate_list(self, ls: List[int]) -> List[List[int]]:
         # TODO: also check whether the ids belong to the same document
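The three assignments pull retrieval defaults from configs.model_config onto every store instance; this hunk only shows them being set, not how they are consumed. A hedged sketch of where they surface, reusing the setup of the test script added earlier in this diff:

    from configs.model_config import *
    from langchain.embeddings.huggingface import HuggingFaceEmbeddings
    from vectorstores import MyFAISS
    from chains.local_doc_qa import load_file

    docs = load_file("knowledge_base/samples/content/test.txt", using_zh_title_enhance=True)
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
                                       model_kwargs={'device': EMBEDDING_DEVICE})
    store = MyFAISS.from_documents(docs, embeddings)   # from_documents() routes through __init__ above
    print(store.score_threshold, store.chunk_size, store.chunk_conent)
    store.chunk_size = 500   # hypothetical per-store override; the default comes from CHUNK_SIZE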
@@ -15,7 +15,7 @@ COPY . /app
 
 RUN pnpm run build
 
+FROM frontend AS final
 
 COPY --from=frontend /app/dist /app/public
 
@@ -24,15 +24,6 @@ export const getfilelist = (knowledge_base_id: any) => {
 
   })
 }
-
-export const getkblist = (knowledge_base_id: any) => {
-  return api({
-    url: '/local_doc_qa/list_knowledge_base',
-    method: 'get',
-    params: {},
-
-  })
-}
 export const bing_search = (params: any) => {
   return api({
     url: '/local_doc_qa/bing_search_chat',
@@ -48,13 +39,6 @@ export const deletefile = (params: any) => {
     data: JSON.stringify(params),
   })
 }
-export const deletekb = (params: any) => {
-  return api({
-    url: '/local_doc_qa/delete_knowledge_base',
-    method: 'post',
-    data: JSON.stringify(params),
-  })
-}
 export const web_url = () => {
   return window.location.origin
 }
@@ -3,7 +3,7 @@ import { NButton, NForm, NFormItem, NInput, NPopconfirm } from 'naive-ui'
 import { onMounted, ref } from 'vue'
 import filelist from './filelist.vue'
 import { SvgIcon } from '@/components/common'
-import { getkblist, deletekb} from '@/api/chat'
+import { deletefile, getfilelist } from '@/api/chat'
 import { idStore } from '@/store/modules/knowledgebaseid/id'
 const items = ref<any>([])
 const choice = ref('')
@@ -11,7 +11,7 @@ const store = idStore()
 
 onMounted(async () => {
   choice.value = store.knowledgeid
-  const res = await getkblist({})
+  const res = await getfilelist({})
   res.data.data.forEach((item: any) => {
     items.value.push({
       value: item,
@@ -52,8 +52,8 @@ const handleClick = () => {
   }
 }
 async function handleDelete(item: any) {
-  await deletekb(item.value)
-  const res = await getkblist({})
+  await deletefile(item.value)
+  const res = await getfilelist({})
   items.value = []
   res.data.data.forEach((item: any) => {
     items.value.push({