From e177b0bbc839491a780ce4813a9c7da760df38f8 Mon Sep 17 00:00:00 2001
From: hzhaoy
Date: Thu, 15 Jun 2023 11:36:57 +0800
Subject: [PATCH 1/5] Fix #635: resolve the vue frontend image build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 views/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/views/Dockerfile b/views/Dockerfile
index 8768d12..5014ca0 100644
--- a/views/Dockerfile
+++ b/views/Dockerfile
@@ -15,7 +15,7 @@
 COPY . /app
 
 RUN pnpm run build
 
-
+FROM frontend AS final
 
 COPY --from=frontend /app/dist /app/public

From 409a302f9b5021c405be9a2f00d7f230583ba6a6 Mon Sep 17 00:00:00 2001
From: fxjhello <127916299+fxjhello@users.noreply.github.com>
Date: Sat, 17 Jun 2023 20:29:25 +0800
Subject: [PATCH 2/5] Revert "fix: frontend knowledge base retrieval failure;
 change the get and delete endpoints to knowledge_base"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 views/src/api/chat.ts                          | 16 ----------------
 .../chat/layout/sider/knowledge-base/index.vue |  8 ++++----
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/views/src/api/chat.ts b/views/src/api/chat.ts
index faf4a36..41f9f99 100644
--- a/views/src/api/chat.ts
+++ b/views/src/api/chat.ts
@@ -24,15 +24,6 @@ export const getfilelist = (knowledge_base_id: any) => {
   })
 }
 
-
-export const getkblist = (knowledge_base_id: any) => {
-  return api({
-    url: '/local_doc_qa/list_knowledge_base',
-    method: 'get',
-    params: {},
-
-  })
-}
 export const bing_search = (params: any) => {
   return api({
     url: '/local_doc_qa/bing_search_chat',
@@ -48,13 +39,6 @@ export const deletefile = (params: any) => {
     data: JSON.stringify(params),
   })
 }
-export const deletekb = (params: any) => {
-  return api({
-    url: '/local_doc_qa/delete_knowledge_base',
-    method: 'post',
-    data: JSON.stringify(params),
-  })
-}
 export const web_url = () => {
   return window.location.origin
 }
diff --git a/views/src/views/chat/layout/sider/knowledge-base/index.vue b/views/src/views/chat/layout/sider/knowledge-base/index.vue
index 43b263e..73272e6 100644
--- a/views/src/views/chat/layout/sider/knowledge-base/index.vue
+++ b/views/src/views/chat/layout/sider/knowledge-base/index.vue
@@ -3,7 +3,7 @@ import { NButton, NForm, NFormItem, NInput, NPopconfirm } from 'naive-ui'
 import { onMounted, ref } from 'vue'
 import filelist from './filelist.vue'
 import { SvgIcon } from '@/components/common'
-import { getkblist, deletekb} from '@/api/chat'
+import { deletefile, getfilelist } from '@/api/chat'
 import { idStore } from '@/store/modules/knowledgebaseid/id'
 const items = ref([])
 const choice = ref('')
@@ -11,7 +11,7 @@ const store = idStore()
 
 onMounted(async () => {
   choice.value = store.knowledgeid
-  const res = await getkblist({})
+  const res = await getfilelist({})
   res.data.data.forEach((item: any) => {
     items.value.push({
       value: item,
@@ -52,8 +52,8 @@ const handleClick = () => {
   }
 }
 async function handleDelete(item: any) {
-  await deletekb(item.value)
-  const res = await getkblist({})
+  await deletefile(item.value)
+  const res = await getfilelist({})
   items.value = []
   res.data.data.forEach((item: any) => {
     items.value.push({

From 25b46a7b9e71d42dd79f314e98eb2628fac0d95c Mon Sep 17 00:00:00 2001
From: kiddog99 <49153012+kiddog99@users.noreply.github.com>
Date: Sun, 18 Jun 2023 21:45:06 +0800
Subject: [PATCH 3/5] Title enhancement (#631)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add files via upload

* Update local_doc_qa.py

* Update model_config.py

* Update zh_title_enhance.py

* Add files via upload

* Update README.md

* fix bugs in MyFAISS.delete_doc

* fix: frontend knowledge base retrieval failure

* update zh_title_enhance.py

* update zh_title_enhance.py

* Update zh_title_enhance.py

* add test/textsplitter

* add test_zh_title_enhance.py

---------

Co-authored-by: imClumsyPanda
Co-authored-by: JZF
Co-authored-by: fxjhello <127916299+fxjhello@users.noreply.github.com>
---
 chains/local_doc_qa.py                     |  5 +-
 configs/model_config.py                    |  7 +-
 test/textsplitter/test_zh_title_enhance.py | 21 +++
 textsplitter/__init__.py                   |  3 +-
 textsplitter/zh_title_enhance.py           | 99 ++++++++++++++++++++++
 vectorstores/MyFAISS.py                    |  4 +
 6 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100644 test/textsplitter/test_zh_title_enhance.py
 create mode 100644 textsplitter/zh_title_enhance.py

diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py
index 79c80f7..fe70066 100644
--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -17,6 +17,7 @@ import models.shared as shared
 from agent import bing_search
 from langchain.docstore.document import Document
 from functools import lru_cache
+from textsplitter.zh_title_enhance import zh_title_enhance
 
 
 # patch HuggingFaceEmbeddings to make it hashable
@@ -56,7 +57,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
     return ret_list, [os.path.basename(p) for p in ret_list]
 
 
-def load_file(filepath, sentence_size=SENTENCE_SIZE):
+def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE):
     if filepath.lower().endswith(".md"):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         docs = loader.load()
@@ -79,6 +80,8 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
         docs = loader.load_and_split(text_splitter=textsplitter)
+    if using_zh_title_enhance:
+        docs = zh_title_enhance(docs)
     write_check_file(filepath, docs)
     return docs
 
diff --git a/configs/model_config.py b/configs/model_config.py
index 6604fc5..4644c7f 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -173,4 +173,9 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
 
 # 此外,如果是在服务器上,报Failed to establish a new connection: [Errno 110] Connection timed out
 # 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG
-BING_SUBSCRIPTION_KEY = ""
\ No newline at end of file
+BING_SUBSCRIPTION_KEY = ""
+
+# 是否开启中文标题加强,以及标题增强的相关配置
+# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
+# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
+ZH_TITLE_ENHANCE = True
diff --git a/test/textsplitter/test_zh_title_enhance.py b/test/textsplitter/test_zh_title_enhance.py
new file mode 100644
index 0000000..def0fcc
--- /dev/null
+++ b/test/textsplitter/test_zh_title_enhance.py
@@ -0,0 +1,21 @@
+from configs.model_config import *
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import nltk
+from vectorstores import MyFAISS
+from chains.local_doc_qa import load_file
+
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+if __name__ == "__main__":
+    filepath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                            "knowledge_base", "samples", "content", "test.txt")
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
+                                       model_kwargs={'device': EMBEDDING_DEVICE})
+
+    docs = load_file(filepath, using_zh_title_enhance=True)
+    vector_store = MyFAISS.from_documents(docs, embeddings)
+    query = "指令提示技术有什么示例"
+    search_result = vector_store.similarity_search(query)
+    print(search_result)
+    pass
diff --git a/textsplitter/__init__.py b/textsplitter/__init__.py
index 114b93c..f059ccb 100644
--- a/textsplitter/__init__.py
+++ b/textsplitter/__init__.py
@@ -1,2 +1,3 @@
 from .chinese_text_splitter import ChineseTextSplitter
-from .ali_text_splitter import AliTextSplitter
\ No newline at end of file
+from .ali_text_splitter import AliTextSplitter
+from .zh_title_enhance import zh_title_enhance
\ No newline at end of file
diff --git a/textsplitter/zh_title_enhance.py b/textsplitter/zh_title_enhance.py
new file mode 100644
index 0000000..7f8c548
--- /dev/null
+++ b/textsplitter/zh_title_enhance.py
@@ -0,0 +1,99 @@
+from langchain.docstore.document import Document
+import re
+
+
+def under_non_alpha_ratio(text: str, threshold: float = 0.5):
+    """Checks if the proportion of non-alpha characters in the text snippet exceeds a given
+    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
+    as a title or narrative text. The ratio does not count spaces.
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the proportion of non-alpha characters exceeds this threshold, the function
+        returns False
+    """
+    if len(text) == 0:
+        return False
+
+    alpha_count = len([char for char in text if char.strip() and char.isalpha()])
+    total_count = len([char for char in text if char.strip()])
+    try:
+        ratio = alpha_count / total_count
+        return ratio < threshold
+    except:
+        return False
+
+
+def is_possible_title(
+        text: str,
+        title_max_word_length: int = 20,
+        non_alpha_threshold: float = 0.5,
+) -> bool:
+    """Checks to see if the text passes all of the checks for a valid title.
+
+    Parameters
+    ----------
+    text
+        The input text to check
+    title_max_word_length
+        The maximum number of words a title can contain
+    non_alpha_threshold
+        The minimum number of alpha characters the text needs to be considered a title
+    """
+
+    # 文本长度为0的话,肯定不是title
+    if len(text) == 0:
+        print("Not a title. Text is empty.")
+        return False
+
+    # 文本中有标点符号,就不是title
+    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
+    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
+    if ENDS_IN_PUNCT_RE.search(text) is not None:
+        return False
+
+    # 文本长度不能超过设定值,默认20
+    # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
+    # is less expensive and actual tokenization doesn't add much value for the length check
+    if len(text) > title_max_word_length:
+        return False
+
+    # 文本中数字的占比不能太高,否则不是title
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
+    # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
+    if text.endswith((",", ".", ",", "。")):
+        return False
+
+    if text.isnumeric():
+        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
+        return False
+
+    # 开头的字符内应该有数字,默认5个字符内
+    if len(text) < 5:
+        text_5 = text
+    else:
+        text_5 = text[:5]
+    alpha_in_text_5 = sum(list(map(lambda x: x.isnumeric(), list(text_5))))
+    if not alpha_in_text_5:
+        return False
+
+    return True
+
+
+def zh_title_enhance(docs: Document) -> Document:
+    title = None
+    if len(docs) > 0:
+        for doc in docs:
+            if is_possible_title(doc.page_content):
+                doc.metadata['category'] = 'cn_Title'
+                title = doc.page_content
+            elif title:
+                doc.page_content = f"下文与({title})有关。{doc.page_content}"
+        return docs
+    else:
+        print("文件不存在")
diff --git a/vectorstores/MyFAISS.py b/vectorstores/MyFAISS.py
index 9e0ae17..0ca29e9 100644
--- a/vectorstores/MyFAISS.py
+++ b/vectorstores/MyFAISS.py
@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
 import numpy as np
 import copy
 import os
+from configs.model_config import *
 
 
 class MyFAISS(FAISS, VectorStore):
@@ -23,6 +24,9 @@ class MyFAISS(FAISS, VectorStore):
                          docstore=docstore,
                          index_to_docstore_id=index_to_docstore_id,
                          normalize_L2=normalize_L2)
+        self.score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD
+        self.chunk_size = CHUNK_SIZE
+        self.chunk_conent = False
 
     def seperate_list(self, ls: List[int]) -> List[List[int]]:
         # TODO: 增加是否属于同一文档的判断

From f6218316e3be95e334b343fa9acc18945e1335dc Mon Sep 17 00:00:00 2001
From: imClumsyPanda
Date: Sun, 18 Jun 2023 21:52:49 +0800
Subject: [PATCH 4/5] update requirements.txt and model_config.py

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b067c19..9f962dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,5 +33,4 @@ numpy~=1.23.5
 tqdm~=4.65.0
 requests~=2.28.2
 tenacity~=8.2.2
-# 默认下载的charset_normalizer模块版本过高会抛出,`artially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)`
 charset_normalizer==2.1.0
\ No newline at end of file

From 017b34647ed8be7e3b43096b59ab79c71eac05ea Mon Sep 17 00:00:00 2001
From: imClumsyPanda
Date: Sun, 18 Jun 2023 21:52:56 +0800
Subject: [PATCH 5/5] update requirements.txt and model_config.py

---
 configs/model_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/model_config.py b/configs/model_config.py
index 4644c7f..846da98 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -178,4 +178,4 @@ BING_SUBSCRIPTION_KEY = ""
 # 是否开启中文标题加强,以及标题增强的相关配置
 # 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
 # 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
-ZH_TITLE_ENHANCE = True
+ZH_TITLE_ENHANCE = False
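
The title enhancement added in PATCH 3/5 works in two passes over the chunks produced by the text splitter: is_possible_title tags heading-like chunks with metadata['category'] = 'cn_Title', and every following non-title chunk gets the most recent title prepended to its page_content. A minimal sketch of that behaviour, assuming the repository root is on PYTHONPATH so the textsplitter module from this series is importable; the two sample strings are made up for illustration:

    from langchain.docstore.document import Document
    from textsplitter.zh_title_enhance import zh_title_enhance

    # hypothetical chunks, roughly what ChineseTextSplitter might produce
    docs = [
        Document(page_content="第1章 概述"),              # short, digit in first 5 chars -> tagged as cn_Title
        Document(page_content="本章介绍系统的整体结构。"),  # narrative text -> latest title gets prepended
    ]

    docs = zh_title_enhance(docs)
    print(docs[0].metadata)      # expected per the patch: {'category': 'cn_Title'}
    print(docs[1].page_content)  # expected: 下文与(第1章 概述)有关。本章介绍系统的整体结构。

Per is_possible_title above, a chunk only counts as a title when it is non-empty, at most 20 characters, does not end in punctuation, is not mostly non-alpha characters, and contains a numeric character within its first 5 characters, which is why the sample heading uses "第1章". Note also that PATCH 5/5 flips the ZH_TITLE_ENHANCE default back to False, so load_file applies this step only when the config flag or its using_zh_title_enhance argument is explicitly enabled.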