commit 6e953da07b
Merge branch 'dev' into master
@@ -17,6 +17,7 @@ import models.shared as shared
 from agent import bing_search
 from langchain.docstore.document import Document
 from functools import lru_cache
+from textsplitter.zh_title_enhance import zh_title_enhance
 
 
 # patch HuggingFaceEmbeddings to make it hashable
@@ -56,7 +57,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
     return ret_list, [os.path.basename(p) for p in ret_list]
 
 
-def load_file(filepath, sentence_size=SENTENCE_SIZE):
+def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE):
     if filepath.lower().endswith(".md"):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         docs = loader.load()
@@ -79,6 +80,8 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
         loader = UnstructuredFileLoader(filepath, mode="elements")
         textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
         docs = loader.load_and_split(text_splitter=textsplitter)
+    if using_zh_title_enhance:
+        docs = zh_title_enhance(docs)
     write_check_file(filepath, docs)
     return docs
 
@@ -173,4 +173,9 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
 
 # Also, if this runs on a server and raises "Failed to establish a new connection: [Errno 110] Connection timed out",
 # the server is behind a firewall; ask the administrator to whitelist the endpoint. If it is a company server, don't count on it.
 BING_SUBSCRIPTION_KEY = ""
+
+# Whether to enable Chinese title enhancement, plus its related configuration.
+# A title check marks which text chunks are titles and tags them in the metadata;
+# each chunk is then concatenated with the nearest preceding title to enrich the text.
+ZH_TITLE_ENHANCE = False
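The new flag ties into the load_file() change earlier in this diff: the parameter using_zh_title_enhance defaults to ZH_TITLE_ENHANCE, so setting the config value to True enables the feature globally, while a single call can still override it. A minimal sketch (hedged: only names introduced in this diff are used; the sample path is relative to the repository root):

    from chains.local_doc_qa import load_file

    # Per-call override of the config default (ZH_TITLE_ENHANCE = False):
    docs = load_file("knowledge_base/samples/content/test.txt",
                     using_zh_title_enhance=True)

    # Title chunks are tagged in metadata; body chunks get the nearest title prepended.
    for doc in docs:
        if doc.metadata.get("category") == "cn_Title":
            print("title:", doc.page_content)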
@@ -33,5 +33,4 @@ numpy~=1.23.5
 tqdm~=4.65.0
 requests~=2.28.2
 tenacity~=8.2.2
-# The charset_normalizer version installed by default is too new and raises `partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)`
 charset_normalizer==2.1.0
@@ -0,0 +1,21 @@
+from configs.model_config import *
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+import nltk
+from vectorstores import MyFAISS
+from chains.local_doc_qa import load_file
+
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+if __name__ == "__main__":
+    filepath = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+                            "knowledge_base", "samples", "content", "test.txt")
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
+                                       model_kwargs={'device': EMBEDDING_DEVICE})
+
+    docs = load_file(filepath, using_zh_title_enhance=True)
+    vector_store = MyFAISS.from_documents(docs, embeddings)
+    query = "指令提示技术有什么示例"
+    search_result = vector_store.similarity_search(query)
+    print(search_result)
+    pass
@@ -1,2 +1,3 @@
 from .chinese_text_splitter import ChineseTextSplitter
 from .ali_text_splitter import AliTextSplitter
+from .zh_title_enhance import zh_title_enhance
@@ -0,0 +1,99 @@
+from langchain.docstore.document import Document
+import re
+
+
+def under_non_alpha_ratio(text: str, threshold: float = 0.5):
+    """Checks if the proportion of non-alpha characters in the text snippet exceeds a given
+    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
+    as a title or narrative text. The ratio does not count spaces.
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the proportion of non-alpha characters exceeds this threshold, the function
+        returns False
+    """
+    if len(text) == 0:
+        return False
+
+    alpha_count = len([char for char in text if char.strip() and char.isalpha()])
+    total_count = len([char for char in text if char.strip()])
+    try:
+        ratio = alpha_count / total_count
+        return ratio < threshold
+    except:
+        return False
+
+
+def is_possible_title(
+        text: str,
+        title_max_word_length: int = 20,
+        non_alpha_threshold: float = 0.5,
+) -> bool:
+    """Checks to see if the text passes all of the checks for a valid title.
+
+    Parameters
+    ----------
+    text
+        The input text to check
+    title_max_word_length
+        The maximum number of words a title can contain
+    non_alpha_threshold
+        The minimum number of alpha characters the text needs to be considered a title
+    """
+
+    # If the text length is 0, it is definitely not a title
+    if len(text) == 0:
+        print("Not a title. Text is empty.")
+        return False
+
+    # If the text ends with punctuation, it is not a title
+    ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
+    ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
+    if ENDS_IN_PUNCT_RE.search(text) is not None:
+        return False
+
+    # The text must not exceed the configured length, 20 by default
+    # NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
+    # is less expensive and actual tokenization doesn't add much value for the length check
+    if len(text) > title_max_word_length:
+        return False
+
+    # The proportion of digits in the text must not be too high, otherwise it is not a title
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
+    # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
+    if text.endswith((",", ".", ",", "。")):
+        return False
+
+    if text.isnumeric():
+        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
+        return False
+
+    # The opening characters should contain a digit, within the first 5 by default
+    if len(text) < 5:
+        text_5 = text
+    else:
+        text_5 = text[:5]
+    alpha_in_text_5 = sum(list(map(lambda x: x.isnumeric(), list(text_5))))
+    if not alpha_in_text_5:
+        return False
+
+    return True
+
+
+def zh_title_enhance(docs: Document) -> Document:
+    title = None
+    if len(docs) > 0:
+        for doc in docs:
+            if is_possible_title(doc.page_content):
+                doc.metadata['category'] = 'cn_Title'
+                title = doc.page_content
+            elif title:
+                doc.page_content = f"下文与({title})有关。{doc.page_content}"
+        return docs
+    else:
+        print("文件不存在")
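A quick illustration of what the new helper does (a sketch: the two sample strings below are invented; the import path comes from the textsplitter/__init__.py change above, and the expected effects follow from the code in this hunk):

    from langchain.docstore.document import Document
    from textsplitter import zh_title_enhance

    docs = [
        Document(page_content="1.2 指令提示技术", metadata={}),               # short, starts with a digit -> treated as a title
        Document(page_content="下面给出几个提示词的示例,供参考。", metadata={}),  # ends with punctuation -> body text
    ]
    docs = zh_title_enhance(docs)
    print(docs[0].metadata)       # -> {'category': 'cn_Title'}
    print(docs[1].page_content)   # -> prefixed with "下文与(1.2 指令提示技术)有关。"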
@@ -7,6 +7,7 @@ from langchain.docstore.document import Document
 import numpy as np
 import copy
 import os
+from configs.model_config import *
 
 
 class MyFAISS(FAISS, VectorStore):
@@ -23,6 +24,9 @@ class MyFAISS(FAISS, VectorStore):
                          docstore=docstore,
                          index_to_docstore_id=index_to_docstore_id,
                          normalize_L2=normalize_L2)
+        self.score_threshold = VECTOR_SEARCH_SCORE_THRESHOLD
+        self.chunk_size = CHUNK_SIZE
+        self.chunk_conent = False
 
     def seperate_list(self, ls: List[int]) -> List[List[int]]:
         # TODO: also check whether the ids belong to the same document
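The three assignments pull retrieval defaults from configs.model_config onto every store instance; this hunk only shows them being set, not how they are consumed. A hedged sketch of where they surface, reusing the setup of the test script added earlier in this diff:

    from configs.model_config import *
    from langchain.embeddings.huggingface import HuggingFaceEmbeddings
    from vectorstores import MyFAISS
    from chains.local_doc_qa import load_file

    docs = load_file("knowledge_base/samples/content/test.txt", using_zh_title_enhance=True)
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],
                                       model_kwargs={'device': EMBEDDING_DEVICE})
    store = MyFAISS.from_documents(docs, embeddings)   # from_documents() routes through __init__ above
    print(store.score_threshold, store.chunk_size, store.chunk_conent)
    store.chunk_size = 500   # hypothetical per-store override; the default comes from CHUNK_SIZE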
@@ -15,7 +15,7 @@ COPY . /app
 
 RUN pnpm run build
 
+FROM frontend AS final
 
 COPY --from=frontend /app/dist /app/public
 
@@ -24,15 +24,6 @@ export const getfilelist = (knowledge_base_id: any) => {
 
   })
 }
-
-export const getkblist = (knowledge_base_id: any) => {
-  return api({
-    url: '/local_doc_qa/list_knowledge_base',
-    method: 'get',
-    params: {},
-
-  })
-}
 export const bing_search = (params: any) => {
   return api({
     url: '/local_doc_qa/bing_search_chat',
@@ -48,13 +39,6 @@ export const deletefile = (params: any) => {
     data: JSON.stringify(params),
   })
 }
-export const deletekb = (params: any) => {
-  return api({
-    url: '/local_doc_qa/delete_knowledge_base',
-    method: 'post',
-    data: JSON.stringify(params),
-  })
-}
 export const web_url = () => {
   return window.location.origin
 }
@@ -3,7 +3,7 @@ import { NButton, NForm, NFormItem, NInput, NPopconfirm } from 'naive-ui'
 import { onMounted, ref } from 'vue'
 import filelist from './filelist.vue'
 import { SvgIcon } from '@/components/common'
-import { getkblist, deletekb} from '@/api/chat'
+import { deletefile, getfilelist } from '@/api/chat'
 import { idStore } from '@/store/modules/knowledgebaseid/id'
 const items = ref<any>([])
 const choice = ref('')
@@ -11,7 +11,7 @@ const store = idStore()
 
 onMounted(async () => {
   choice.value = store.knowledgeid
-  const res = await getkblist({})
+  const res = await getfilelist({})
   res.data.data.forEach((item: any) => {
     items.value.push({
       value: item,
@@ -52,8 +52,8 @@ const handleClick = () => {
   }
 }
 async function handleDelete(item: any) {
-  await deletekb(item.value)
-  const res = await getkblist({})
+  await deletefile(item.value)
+  const res = await getfilelist({})
   items.value = []
   res.data.data.forEach((item: any) => {
     items.value.push({