diff --git a/docs/自定义关键字.md b/docs/自定义关键字.md
deleted file mode 100644
index 6e9e47b..0000000
--- a/docs/自定义关键字.md
+++ /dev/null
@@ -1,80 +0,0 @@
-## Custom Keywords
-
-### Why custom keywords are needed
-
-In a Q&A scenario built on a vector database and an LLM, content snippets relevant to the user's question are first retrieved from the vector database, and then the question together with the retrieved content is fed to the LLM to generate a reply.
-Both when splitting documents into chunks for storage in the vector database and when searching the vector database for content relevant to a question, an embedding model (for example m3e-base) is needed to embed the text into a fixed-length vector.
-
-m3e-base, like many other embedding models, is used through HuggingFaceEmbeddings, which in turn is implemented on top of sentence_transformers.
-For details, see the code in the load_embeddings() function of the EmbeddingsPool class in kb_cache/base.py in this project.
-
-sentence_transformers implements the Sentence-BERT model and uses BERT's WordPiece-based tokenizer. Two examples of how the BERT tokenizer splits text:
-
-    Input text (a single undelimited string): iphone13pro
-    Generated token id sequence: [101, 8210, 8679, 10538, 102]
-    Token to token id mapping:
-        [CLS]->101
-        iphone->8210
-        ##13->8679
-        ##pro->10538
-        [SEP]->102
-    Here iphone13pro is tokenized into three tokens: iphone, ##13, and ##pro. [CLS] and [SEP] are special tokens added automatically.
-
-    Input text: 中石油
-    Generated token id sequence: [101, 704, 4767, 3779, 102]
-    Token to token id mapping:
-        [CLS]->101
-        中->704
-        石->4767
-        油->3779
-        [SEP]->102
-    Here 中石油 is tokenized into the three tokens 中, 石, and 油.
-
-In both examples we would rather have iphone13pro and 中石油 treated as proper nouns and tokenized into a single token each, which improves the precision of text embedding and search.
-If the embedding model is later fine-tuned, these proper nouns, kept as indivisible keywords, can each obtain a better embedding representation as a single token.
-
-### How to use
-1. To define custom keywords, first prepare a plain-text keyword file with one keyword per line. For example:
-
-       File keywords.txt:
-       iphone13pro
-       中石油
-
-2. Configure model_config.py:
-
-       EMBEDDING_KEYWORD_FILE = "keywords.txt"
-       EMBEDDING_MODEL_OUTPUT_PATH = "output"
-
-3. Run keywords_preprocess.py. Each keyword in the keyword file becomes a single token; the embedding weights and the tokenizer of the embedding model are updated and saved to the configured output directory, which has the same layout as the original embedding model's directory. Using that directory as the embedding model, the tokenization results become:
-
-       Input text (a single undelimited string): iphone13pro
-       Generated token id sequence: [101, 21128, 102]
-       Token to token id mapping:
-           [CLS]->101
-           iphone13pro->21128
-           [SEP]->102
-
-       Input text: 中石油
-       Generated token id sequence: [101, 21129, 102]
-       Token to token id mapping:
-           [CLS]->101
-           中石油->21129
-           [SEP]->102
-
-4. Configure model_config.py again, then run the usual workflow:
-    + Use the directory generated in step 3 as the embedding model directory:
-      ```python
-      MODEL_PATH = {
-          "embed_model": {
-              "m3e-base": "output",
-          }
-      }
-      ```
-    + Run init_database.py to initialize the database and the vector store
-    + Run startup.py to start the application
diff --git a/embeddings/add_embedding_keywords.py b/embeddings/add_embedding_keywords.py
index b922a38..622a4ca 100644
--- a/embeddings/add_embedding_keywords.py
+++ b/embeddings/add_embedding_keywords.py
@@ -2,43 +2,71 @@
 This feature adds keywords to the embedding model so that each keyword is embedded as a single token.
 It is implemented by modifying the embedding model's tokenizer.
 It applies only to the model named by the EMBEDDING_MODEL parameter; the output model is saved under the original model's directory.
-The idea for this feature was contributed by the community; thanks to @CharlesJu1
+Thanks to @CharlesJu1 and @charlesyju for contributing the idea and the initial PR
 The saved model is placed under the original embedding model's directory and is named <original model name>_Merge_Keywords_<timestamp>.
 '''
 import sys
-
 sys.path.append("..")
-import os
-from safetensors.torch import save_model
-from sentence_transformers import SentenceTransformer
 from datetime import datetime
 from configs import (
     MODEL_PATH,
     EMBEDDING_MODEL,
     EMBEDDING_KEYWORD_FILE,
 )
+import os
+import torch
+from safetensors.torch import save_model
+from sentence_transformers import SentenceTransformer
 
 
-def add_keyword_to_model(model_name: str = EMBEDDING_MODEL, keyword_file: str = "", output_model_path: str = None):
+def get_keyword_embedding(bert_model, tokenizer, key_words):
+    tokenizer_output = tokenizer(key_words, return_tensors="pt", padding=True, truncation=True)
+
+    # No need to convert to a tensor manually since return_tensors="pt" is set
+    input_ids = tokenizer_output['input_ids']
+
+    # Drop the leading [CLS] and trailing [SEP] token of each sequence in the batch
+    input_ids = input_ids[:, 1:-1]
+
+    # Initialize each keyword embedding as the mean of its subword embeddings
+    keyword_embedding = bert_model.embeddings.word_embeddings(input_ids)
+    keyword_embedding = torch.mean(keyword_embedding, 1)
+
+    return keyword_embedding
+
+
+def add_keyword_to_model(model_name: str = EMBEDDING_MODEL, keyword_file: str = "", output_model_path: str = None):
     key_words = []
     with open(keyword_file, "r") as f:
         for line in f:
             key_words.append(line.strip())
 
-    model = SentenceTransformer(model_name)
-    word_embedding_model = model._first_module()
+    st_model = SentenceTransformer(model_name)
+    key_words_len = len(key_words)
+    word_embedding_model = st_model._first_module()
+    bert_model = word_embedding_model.auto_model
     tokenizer = word_embedding_model.tokenizer
+    key_words_embedding = get_keyword_embedding(bert_model, tokenizer, key_words)
+    # key_words_embedding = st_model.encode(key_words)
+
+    embedding_weight = bert_model.embeddings.word_embeddings.weight
+    embedding_weight_len = len(embedding_weight)
     tokenizer.add_tokens(key_words)
-    word_embedding_model.auto_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
+    bert_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
+
+    # Copy the precomputed keyword embeddings into the newly added vocabulary rows
+    embedding_weight = bert_model.embeddings.word_embeddings.weight
+    with torch.no_grad():
+        embedding_weight[embedding_weight_len:embedding_weight_len + key_words_len, :] = key_words_embedding
 
     if output_model_path:
         os.makedirs(output_model_path, exist_ok=True)
-        tokenizer.save_pretrained(output_model_path)
-        model.save(output_model_path)
+        word_embedding_model.save(output_model_path)
         safetensors_file = os.path.join(output_model_path, "model.safetensors")
         metadata = {'format': 'pt'}
-        save_model(model, safetensors_file, metadata)
+        save_model(bert_model, safetensors_file, metadata)
+        print("save model to {}".format(output_model_path))
+
 
 def add_keyword_to_embedding_model(path: str = EMBEDDING_KEYWORD_FILE):
     keyword_file = os.path.join(path)
@@ -48,8 +76,46 @@ def add_keyword_to_embedding_model(path: str = EMBEDDING_KEYWORD_FILE):
     output_model_name = "{}_Merge_Keywords_{}".format(EMBEDDING_MODEL, current_time)
     output_model_path = os.path.join(model_parent_directory, output_model_name)
     add_keyword_to_model(model_name, keyword_file, output_model_path)
-    print("save model to {}".format(output_model_path))
 
 
 if __name__ == '__main__':
     add_keyword_to_embedding_model(EMBEDDING_KEYWORD_FILE)
+
+    # input_model_name = ""
+    # output_model_path = ""
+    # # Test-case comparison of the tokenizer before and after adding keywords
+    # def print_token_ids(output, tokenizer, sentences):
+    #     for idx, ids in enumerate(output['input_ids']):
+    #         print(f'sentence={sentences[idx]}')
+    #         print(f'ids={ids}')
+    #         for id in ids:
+    #             decoded_id = tokenizer.decode(id)
+    #             print(f'  {decoded_id}->{id}')
+    #
+    # sentences = [
+    #     '数据科学与大数据技术',
+    #     'Langchain-Chatchat'
+    # ]
+    #
+    # st_no_keywords = SentenceTransformer(input_model_name)
+    # tokenizer_without_keywords = st_no_keywords.tokenizer
+    # print("===== tokenizer with no keywords added =====")
+    # output = tokenizer_without_keywords(sentences)
+    # print_token_ids(output, tokenizer_without_keywords, sentences)
+    # print('-------- embedding with no keywords added -----')
+    # embeddings = st_no_keywords.encode(sentences)
+    # print(embeddings)
+
+    # print("--------------------------------------------")
+
+    # st_with_keywords = SentenceTransformer(output_model_path)
+    # tokenizer_with_keywords = st_with_keywords.tokenizer
+    # print("===== tokenizer with keywords added =====")
+    # output = tokenizer_with_keywords(sentences)
+    # print_token_ids(output, tokenizer_with_keywords, sentences)
+    #
+    # print('-------- embedding with keywords added -----')
+    # embeddings = st_with_keywords.encode(sentences)
+    # print(embeddings)
\ No newline at end of file
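The commented-out block above sketches a before/after comparison; a condensed, runnable version of the same check is below. This is a minimal sketch, assuming "m3e-base" is the path of the base embedding model and "output" is a keyword-merged model produced by add_keyword_to_model (both paths are placeholders, not project constants):

```python
# Minimal sketch: compare tokenization before and after keyword merging.
# "m3e-base" and "output" are placeholder model paths.
from sentence_transformers import SentenceTransformer

def show_tokens(model_path: str, text: str) -> None:
    tokenizer = SentenceTransformer(model_path).tokenizer
    for token_id in tokenizer(text)["input_ids"]:
        print(f"  {tokenizer.decode(token_id)} -> {token_id}")

show_tokens("m3e-base", "iphone13pro")  # expect iphone / ##13 / ##pro pieces
show_tokens("output", "iphone13pro")    # expect a single iphone13pro token
```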
diff --git a/keywords_preprocess.py b/keywords_preprocess.py
deleted file mode 100644
index 0d94258..0000000
--- a/keywords_preprocess.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import os
-
-import torch
-from safetensors.torch import save_model
-from sentence_transformers import SentenceTransformer
-
-def get_keyword_embedding(bert_model, tokenizer, key_words):
-    tokenizer_output = tokenizer(key_words)
-    input_ids = torch.tensor(tokenizer_output['input_ids'])[:, 1:-1]
-    keyword_embedding = bert_model.embeddings.word_embeddings(input_ids)
-    keyword_embedding = torch.mean(keyword_embedding, 1)
-    return keyword_embedding
-
-
-def add_keyword_to_model(model_name, key_words, output_model_path):
-    st_model = SentenceTransformer(model_name)
-    key_words_len = len(key_words)
-    word_embedding_model = st_model._first_module()
-    bert_model = word_embedding_model.auto_model
-    tokenizer = word_embedding_model.tokenizer
-    key_words_embedding = get_keyword_embedding(bert_model, tokenizer, key_words)
-    # key_words_embedding = st_model.encode(key_words)
-
-    embedding_weight = bert_model.embeddings.word_embeddings.weight
-    embedding_weight_len = len(embedding_weight)
-    tokenizer.add_tokens(key_words)
-    bert_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
-
-    # key_words_embedding_tensor = torch.from_numpy(key_words_embedding)
-    embedding_weight = bert_model.embeddings.word_embeddings.weight
-    with torch.no_grad():
-        embedding_weight[embedding_weight_len:embedding_weight_len+key_words_len, :] = key_words_embedding
-
-    if output_model_path:
-        os.makedirs(output_model_path, exist_ok=True)
-        word_embedding_model.save(output_model_path)
-        safetensors_file = os.path.join(output_model_path, "model.safetensors")
-        metadata = {'format': 'pt'}
-        save_model(bert_model, safetensors_file, metadata)
-
-def add_keyword_file_to_model(model_name, keyword_file, output_model_path):
-    key_words = []
-    with open(keyword_file, "r") as f:
-        for line in f:
-            key_words.append(line.strip())
-    add_keyword_to_model(model_name, key_words, output_model_path)
-
-
-if __name__ == '__main__':
-    from configs import (
-        MODEL_PATH,
-        EMBEDDING_MODEL,
-        EMBEDDING_KEYWORD_FILE,
-        EMBEDDING_MODEL_OUTPUT_PATH
-    )
-    keyword_file = EMBEDDING_KEYWORD_FILE
-    model_name = MODEL_PATH["embed_model"][EMBEDDING_MODEL]
-    output_model_path = EMBEDDING_MODEL_OUTPUT_PATH
-
-    add_keyword_file_to_model(model_name, keyword_file, output_model_path)
-
-    # Test-case comparison of the tokenizer before and after adding keywords
-    def print_token_ids(output, tokenizer, sentences):
-        for idx, ids in enumerate(output['input_ids']):
-            print(f'sentence={sentences[idx]}')
-            print(f'ids={ids}')
-            for id in ids:
-                decoded_id = tokenizer.decode(id)
-                print(f'  {decoded_id}->{id}')
-
-    # sentences = [
-    #     '任务中国',
-    #     '中石油',
-    #     '指令提示技术'
-    #     'Apple Watch Series 3 is good',
-    #     'Apple Watch Series 8 is good',
-    #     'Apple Watch Series is good',
-    #     'Apple Watch is good',
-    #     'iphone 13pro']
-    sentences = [
-        '指令提示技术',
-        'Apple Watch Series 3'
-    ]
-
-    st_no_keywords = SentenceTransformer(model_name)
-    tokenizer_without_keywords = st_no_keywords.tokenizer
-    print("===== tokenizer with no keywords added =====")
-    output = tokenizer_without_keywords(sentences)
-    print_token_ids(output, tokenizer_without_keywords, sentences)
-    print(f'-------- embedding with no keywords added -----')
-    embeddings = st_no_keywords.encode(sentences)
-    print(embeddings)
-
-    st_with_keywords = SentenceTransformer(output_model_path)
-    tokenizer_with_keywords = st_with_keywords.tokenizer
-    print("===== tokenizer with keyword added =====")
-    output = tokenizer_with_keywords(sentences)
-    print_token_ids(output, tokenizer_with_keywords, sentences)
-
-    print(f'-------- embedding with keywords added -----')
-    embeddings = st_with_keywords.encode(sentences)
-    print(embeddings)
-
-
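Beyond tokenization, the merge can be sanity-checked numerically: each newly added token's embedding row should equal the mean of its original subword embeddings, since that is how get_keyword_embedding initializes it. A minimal sketch, using the same placeholder paths as above:

```python
# Minimal sanity check: a merged keyword's embedding row should equal the mean
# of its subword embeddings in the base model. "m3e-base" and "output" are
# placeholder paths for the base and keyword-merged models.
import torch
from sentence_transformers import SentenceTransformer

base = SentenceTransformer("m3e-base")._first_module()
merged = SentenceTransformer("output")._first_module()

keyword = "iphone13pro"
with torch.no_grad():
    # Subword ids under the base tokenizer, with [CLS]/[SEP] stripped.
    sub_ids = base.tokenizer(keyword, return_tensors="pt")["input_ids"][:, 1:-1]
    expected = base.auto_model.embeddings.word_embeddings(sub_ids).mean(1).squeeze(0)

    # Embedding row of the newly added single token in the merged model.
    new_id = merged.tokenizer.convert_tokens_to_ids(keyword)
    actual = merged.auto_model.embeddings.word_embeddings.weight[new_id]

print(torch.allclose(expected, actual, atol=1e-6))
```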