From e177b0bbc839491a780ce4813a9c7da760df38f8 Mon Sep 17 00:00:00 2001
From: hzhaoy
Date: Thu, 15 Jun 2023 11:36:57 +0800
Subject: [PATCH 1/5] Fix #635: resolve the vue frontend image build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 views/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/views/Dockerfile b/views/Dockerfile
index 8768d12..5014ca0 100644
--- a/views/Dockerfile
+++ b/views/Dockerfile
@@ -15,7 +15,7 @@
 COPY . /app
 
 RUN pnpm run build
 
-
+FROM frontend AS final
 
 COPY --from=frontend /app/dist /app/public

From 409a302f9b5021c405be9a2f00d7f230583ba6a6 Mon Sep 17 00:00:00 2001
From: fxjhello <127916299+fxjhello@users.noreply.github.com>
Date: Sat, 17 Jun 2023 20:29:25 +0800
Subject: [PATCH 2/5] Revert "fix: frontend knowledge base retrieval failure;
 change the get and delete endpoints to knowledge_base"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 views/src/api/chat.ts                          | 16 ----------------
 .../chat/layout/sider/knowledge-base/index.vue |  8 ++++----
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/views/src/api/chat.ts b/views/src/api/chat.ts
index faf4a36..41f9f99 100644
--- a/views/src/api/chat.ts
+++ b/views/src/api/chat.ts
@@ -24,15 +24,6 @@ export const getfilelist = (knowledge_base_id: any) => {
   })
 }
 
-
-export const getkblist = (knowledge_base_id: any) => {
-  return api({
-    url: '/local_doc_qa/list_knowledge_base',
-    method: 'get',
-    params: {},
-
-  })
-}
 export const bing_search = (params: any) => {
   return api({
     url: '/local_doc_qa/bing_search_chat',
@@ -48,13 +39,6 @@ export const deletefile = (params: any) => {
     data: JSON.stringify(params),
   })
 }
-export const deletekb = (params: any) => {
-  return api({
-    url: '/local_doc_qa/delete_knowledge_base',
-    method: 'post',
-    data: JSON.stringify(params),
-  })
-}
 export const web_url = () => {
   return window.location.origin
 }
diff --git a/views/src/views/chat/layout/sider/knowledge-base/index.vue b/views/src/views/chat/layout/sider/knowledge-base/index.vue
index 43b263e..73272e6 100644
--- a/views/src/views/chat/layout/sider/knowledge-base/index.vue
+++ b/views/src/views/chat/layout/sider/knowledge-base/index.vue
@@ -3,7 +3,7 @@ import { NButton, NForm, NFormItem, NInput, NPopconfirm } from 'naive-ui'
 import { onMounted, ref } from 'vue'
 import filelist from './filelist.vue'
 import { SvgIcon } from '@/components/common'
-import { getkblist, deletekb} from '@/api/chat'
+import { deletefile, getfilelist } from '@/api/chat'
 import { idStore } from '@/store/modules/knowledgebaseid/id'
 const items = ref([])
 const choice = ref('')
@@ -11,7 +11,7 @@ const store = idStore()
 
 onMounted(async () => {
   choice.value = store.knowledgeid
-  const res = await getkblist({})
+  const res = await getfilelist({})
   res.data.data.forEach((item: any) => {
     items.value.push({
       value: item,
@@ -52,8 +52,8 @@ const handleClick = () => {
   }
 }
 async function handleDelete(item: any) {
-  await deletekb(item.value)
-  const res = await getkblist({})
+  await deletefile(item.value)
+  const res = await getfilelist({})
   items.value = []
   res.data.data.forEach((item: any) => {
     items.value.push({

From 25b46a7b9e71d42dd79f314e98eb2628fac0d95c Mon Sep 17 00:00:00 2001
From: kiddog99 <49153012+kiddog99@users.noreply.github.com>
Date: Sun, 18 Jun 2023 21:45:06 +0800
Subject: [PATCH 3/5] Title enhancement (#631)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add files via upload

* Update local_doc_qa.py

* Update model_config.py

* Update zh_title_enhance.py

* Add files via upload

* Update README.md

* fix bugs in MyFAISS.delete_doc

* fix: frontend knowledge base retrieval failure

* update zh_title_enhance.py

* update zh_title_enhance.py

* Update zh_title_enhance.py

* add test/textsplitter

* add test_zh_title_enhance.py

---------

Co-authored-by: imClumsyPanda
Co-authored-by: JZF
Co-authored-by: fxjhello <127916299+fxjhello@users.noreply.github.com>
---
 chains/local_doc_qa.py                     |  5 +-
 configs/model_config.py                    |  7 +-
 test/textsplitter/test_zh_title_enhance.py | 21 +++
 textsplitter/__init__.py                   |  3 +-
 textsplitter/zh_title_enhance.py           | 99 ++++++++++++++++++++++
 vectorstores/MyFAISS.py                    |  4 +
 6 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100644 test/textsplitter/test_zh_title_enhance.py
 create mode 100644 textsplitter/zh_title_enhance.py

diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py
index 79c80f7..fe70066 100644
--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -17,6 +17,7 @@ import models.shared as shared
 from agent import bing_search
 from langchain.docstore.document import Document
 from functools import lru_cache
+from textsplitter.zh_title_enhance import zh_title_enhance
 
 
 # patch HuggingFaceEmbeddings to make it hashable
@@ -56,7 +57,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
     return ret_list, [os.path.basename(p) for p in ret_list]
 
 
-def load_file(filepath, sentence_size=SENTENCE_SIZE):
+def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE):
     if filepath.lower().endswith(".md"):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         docs = loader.load()
@@ -79,6 +80,8 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
         docs = loader.load_and_split(text_splitter=textsplitter)
+    if using_zh_title_enhance:
+        docs = zh_title_enhance(docs)
     write_check_file(filepath, docs)
     return docs
 
diff --git a/configs/model_config.py b/configs/model_config.py
index 6604fc5..4644c7f 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -173,4 +173,9 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
 
 # 此外,如果是在服务器上,报Failed to establish a new connection: [Errno 110] Connection timed out
 # 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG
-BING_SUBSCRIPTION_KEY = ""
\ No newline at end of file
+BING_SUBSCRIPTION_KEY = ""
+
+# 是否开启中文标题加强,以及标题增强的相关配置
+# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
+# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
+ZH_TITLE_ENHANCE = True
diff --git a/test/textsplitter/test_zh_title_enhance.py b/test/textsplitter/test_zh_title_enhance.py
new file mode 100644
index 0000000..def0fcc
--- /dev/null
+++ b/test/textsplitter/test_zh_title_enhance.py
@@ -0,0 +1,21 @@
+from configs.model_config import *
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import nltk
+from vectorstores import MyFAISS
+from chains.local_doc_qa import load_file
+
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+if __name__ == "__main__":
+    filepath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                            "knowledge_base", "samples", "content", "test.txt")
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
+                                       model_kwargs={'device': EMBEDDING_DEVICE})
+
+    docs = load_file(filepath, using_zh_title_enhance=True)
+    vector_store = MyFAISS.from_documents(docs, embeddings)
+    query = "指令提示技术有什么示例"
+    search_result = vector_store.similarity_search(query)
+    print(search_result)
+    pass
diff --git a/textsplitter/__init__.py b/textsplitter/__init__.py
index 114b93c..f059ccb 100644
--- a/textsplitter/__init__.py
+++ b/textsplitter/__init__.py
@@ -1,2 +1,3 @@
 from .chinese_text_splitter import ChineseTextSplitter
-from .ali_text_splitter import AliTextSplitter
\ No newline at end of file
+from .ali_text_splitter import AliTextSplitter
+from .zh_title_enhance import zh_title_enhance
\ No newline at end of file
diff --git a/textsplitter/zh_title_enhance.py b/textsplitter/zh_title_enhance.py
new file mode 100644
index 0000000..7f8c548
--- /dev/null
+++ b/textsplitter/zh_title_enhance.py
@@ -0,0 +1,99 @@
+from langchain.docstore.document import Document
+import re
+
+
+def under_non_alpha_ratio(text: str, threshold: float = 0.5):
+    """Checks if the proportion of non-alpha characters in the text snippet exceeds a given
+    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
+    as a title or narrative text. The ratio does not count spaces.
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the proportion of non-alpha characters exceeds this threshold, the function
+        returns False
+    """
+    if len(text) == 0:
+        return False
+
+    alpha_count = len([char for char in text if char.strip() and char.isalpha()])
+    total_count = len([char for char in text if char.strip()])
+    try:
+        ratio = alpha_count / total_count
+        return ratio < threshold
+    except:
+        return False
+
+
+def is_possible_title(
+        text: str,
+        title_max_word_length: int = 20,
+        non_alpha_threshold: float = 0.5,
+) -> bool:
+    """Checks to see if the text passes all of the checks for a valid title.
+
+    Parameters
+    ----------
+    text
+        The input text to check
+    title_max_word_length
+        The maximum number of words a title can contain
+    non_alpha_threshold
+        The minimum number of alpha characters the text needs to be considered a title
+    """
+
+    # 文本长度为0的话,肯定不是title
+    if len(text) == 0:
+        print("Not a title. Text is empty.")
+        return False
+
+    # 文本中有标点符号,就不是title
+    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
+    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
+    if ENDS_IN_PUNCT_RE.search(text) is not None:
+        return False
+
+    # 文本长度不能超过设定值,默认20
+    # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
+    # is less expensive and actual tokenization doesn't add much value for the length check
+    if len(text) > title_max_word_length:
+        return False
+
+    # 文本中数字的占比不能太高,否则不是title
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
+    # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
+    if text.endswith((",", ".", ",", "。")):
+        return False
+
+    if text.isnumeric():
+        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
+        return False
+
+    # 开头的字符内应该有数字,默认5个字符内
+    if len(text) < 5:
+        text_5 = text
+    else:
+        text_5 = text[:5]
+    alpha_in_text_5 = sum(list(map(lambda x: x.isnumeric(), list(text_5))))
+    if not alpha_in_text_5:
+        return False
+
+    return True
+
+
+def zh_title_enhance(docs: Document) -> Document:
+    title = None
+    if len(docs) > 0:
+        for doc in docs:
+            if is_possible_title(doc.page_content):
+                doc.metadata['category'] = 'cn_Title'
+                title = doc.page_content
+            elif title:
+                doc.page_content = f"下文与({title})有关。{doc.page_content}"
+        return docs
+    else:
+        print("文件不存在")
diff --git a/vectorstores/MyFAISS.py b/vectorstores/MyFAISS.py
index 9e0ae17..0ca29e9 100644
--- a/vectorstores/MyFAISS.py
+++ b/vectorstores/MyFAISS.py
@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
 import numpy as np
 import copy
 import os
+from configs.model_config import *
 
 
 class MyFAISS(FAISS, VectorStore):
@@ -23,6 +24,9 @@ class MyFAISS(FAISS, VectorStore):
                          docstore=docstore,
                          index_to_docstore_id=index_to_docstore_id,
                          normalize_L2=normalize_L2)
+        self.score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD
+        self.chunk_size = CHUNK_SIZE
+        self.chunk_conent = False
 
     def seperate_list(self, ls: List[int]) -> List[List[int]]:
         # TODO: 增加是否属于同一文档的判断

From f6218316e3be95e334b343fa9acc18945e1335dc Mon Sep 17 00:00:00 2001
From: imClumsyPanda
Date: Sun, 18 Jun 2023 21:52:49 +0800
Subject: [PATCH 4/5] update requirements.txt and model_config.py

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b067c19..9f962dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,5 +33,4 @@ numpy~=1.23.5
 tqdm~=4.65.0
 requests~=2.28.2
 tenacity~=8.2.2
-# 默认下载的charset_normalizer模块版本过高会抛出,`artially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)`
 charset_normalizer==2.1.0
\ No newline at end of file

From 017b34647ed8be7e3b43096b59ab79c71eac05ea Mon Sep 17 00:00:00 2001
From: imClumsyPanda
Date: Sun, 18 Jun 2023 21:52:56 +0800
Subject: [PATCH 5/5] update requirements.txt and model_config.py

---
 configs/model_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/model_config.py b/configs/model_config.py
index 4644c7f..846da98 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -178,4 +178,4 @@ BING_SUBSCRIPTION_KEY = ""
 # 是否开启中文标题加强,以及标题增强的相关配置
 # 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
 # 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
-ZH_TITLE_ENHANCE = True
+ZH_TITLE_ENHANCE = False
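
The title enhancement added in PATCH 3/5 works in two passes over the chunks produced by the text splitter: is_possible_title tags heading-like chunks with metadata['category'] = 'cn_Title', and every following non-title chunk gets the most recent title prepended to its page_content. A minimal sketch of that behaviour, assuming the repository root is on PYTHONPATH so the textsplitter module from this series is importable; the two sample strings are made up for illustration:

    from langchain.docstore.document import Document
    from textsplitter.zh_title_enhance import zh_title_enhance

    # hypothetical chunks, roughly what ChineseTextSplitter might produce
    docs = [
        Document(page_content="第1章 概述"),              # short, digit in first 5 chars -> tagged as cn_Title
        Document(page_content="本章介绍系统的整体结构。"),  # narrative text -> latest title gets prepended
    ]

    docs = zh_title_enhance(docs)
    print(docs[0].metadata)      # expected per the patch: {'category': 'cn_Title'}
    print(docs[1].page_content)  # expected: 下文与(第1章 概述)有关。本章介绍系统的整体结构。

Per is_possible_title above, a chunk only counts as a title when it is non-empty, at most 20 characters, does not end in punctuation, is not mostly non-alpha characters, and contains a numeric character within its first 5 characters, which is why the sample heading uses "第1章". Note also that PATCH 5/5 flips the ZH_TITLE_ENHANCE default back to False, so load_file applies this step only when the config flag or its using_zh_title_enhance argument is explicitly enabled.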