From cdf8b0a92b7ac67b9a003dc92bda0635a10e8545 Mon Sep 17 00:00:00 2001
From: guanyuankai
Date: Wed, 30 Jul 2025 17:32:24 +0800
Subject: [PATCH] Done. Grounding Fine-tune

---
 .gitignore               |   2 +
 createDataset.py         |  44 +++++
 ds_zero2_no_offload.json |  29 +++
 train.py                 |  73 ++++++--
 train_grounding.py       | 371 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 506 insertions(+), 13 deletions(-)
 create mode 100644 createDataset.py
 create mode 100644 ds_zero2_no_offload.json
 create mode 100644 train_grounding.py

diff --git a/.gitignore b/.gitignore
index bafc48a..d07269c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,4 +31,6 @@ web_demo_streaming
 evaluation
 requirements_web_demo.txt
 web_demo_mm.py
+output
+output2
diff --git a/createDataset.py b/createDataset.py
new file mode 100644
index 0000000..8412a1f
--- /dev/null
+++ b/createDataset.py
@@ -0,0 +1,44 @@
+import json
+from datasets import Dataset
+
+def load_and_convert_data(file_path):
+    """Load a JSONL file and convert it into Dataset-ready records."""
+    loaded_data = []
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            loaded_data.append(json.loads(line))
+
+    # Convert loaded_data into a format suitable for Dataset
+    dataset_dicts = []
+    for item in loaded_data:
+        user_content = item[0]['content']
+        assistant_content = item[1]['content']
+
+        # Extract the image and text entries
+        image_info = next((x for x in user_content if x['type'] == 'image'), None)
+        text_info = next((x for x in user_content if x['type'] == 'text'), None)
+
+        # Build the new record
+        dataset_entry = {
+            'role': 'user',
+            'image_path': image_info['image'] if image_info else None,
+            'question': text_info['text'] if text_info else None,
+            'assistant_answer': assistant_content
+        }
+
+        dataset_dicts.append(dataset_entry)
+
+    return dataset_dicts
+
+# Load the test and val datasets separately
+test_data_path = 'data_test.jsonl'
+val_data_path = 'data_val.jsonl'
+
+test_dataset_dicts = load_and_convert_data(test_data_path)
+val_dataset_dicts = load_and_convert_data(val_data_path)
+
+# Create Dataset objects
+test_dataset = Dataset.from_list(test_dataset_dicts)
+val_dataset = Dataset.from_list(val_dataset_dicts)
+
+print("Test and Val Datasets have been created.")
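+
+# Illustrative note: each line of the JSONL files is expected to be a two-turn
+# message list; item[0] is the user turn mixing one image entry and one text
+# entry, item[1] is the assistant turn carrying the grounding answer. A
+# conforming record looks like the following (path and box format are made up
+# for illustration, not taken from the real data):
+#
+# [
+#   {"role": "user", "content": [
+#     {"type": "image", "image": "images/example.jpg"},
+#     {"type": "text", "text": "Please provide the bounding box for the following description: the red mug"}
+#   ]},
+#   {"role": "assistant", "content": "(123,45),(210,160)"}
+# ]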
diff --git a/ds_zero2_no_offload.json b/ds_zero2_no_offload.json
new file mode 100644
index 0000000..bdebb07
--- /dev/null
+++ b/ds_zero2_no_offload.json
@@ -0,0 +1,29 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "zero_optimization": {
+        "stage": 2,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 5e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
\ No newline at end of file
diff --git a/train.py b/train.py
index 4a0392e..3b6199b 100644
--- a/train.py
+++ b/train.py
@@ -106,33 +106,80 @@ def predict(messages, model):
     return output_text[0]
 
+def load_and_convert_data(file_path):
+    """Load a JSONL file and convert it into Dataset-ready records."""
+    loaded_data = []
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            loaded_data.append(json.loads(line))
+
+    # Convert loaded_data into a format suitable for Dataset
+    dataset_dicts = []
+    for item in loaded_data:
+        user_content = item[0]['content']
+        assistant_content = item[1]['content']
+
+        # Extract the image and text entries
+        image_info = next((x for x in user_content if x['type'] == 'image'), None)
+        text_info = next((x for x in user_content if x['type'] == 'text'), None)
+
+        # Build the new record
+        dataset_entry = {
+            'role': 'user',
+            'image_path': image_info['image'] if image_info else None,
+            'question': text_info['text'] if text_info else None,
+            'assistant_answer': assistant_content
+        }
+
+        dataset_dicts.append(dataset_entry)
+
+    return dataset_dicts
 
 # Download the Qwen2-VL model from ModelScope into a local directory
 # model_dir = snapshot_download("Qwen/Qwen2-VL-2B-Instruct", cache_dir="./", revision="master")
-
+min_pixels = 256*28*28
+max_pixels = 1280*28*28
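+# Note (assumption): these two values are defined here for reference and are
+# wired into AutoProcessor in train_grounding.py below. Assuming Qwen2.5-VL's
+# convention that every 28x28 pixel block becomes one visual token, they cap
+# each image between 256 and 1280 visual tokens:
+#     min_pixels // (28 * 28)  # -> 256
+#     max_pixels // (28 * 28)  # -> 1280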
 # Load the model weights with Transformers
 tokenizer = AutoTokenizer.from_pretrained("/home/gyk/models/Qwen2.5-VL-7B-Instruct/", use_fast=False, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained("/home/gyk/models/Qwen2.5-VL-7B-Instruct/")
-
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained("/home/gyk/models/Qwen2.5-VL-7B-Instruct/", device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True,)
 model.enable_input_require_grads()  # Required when gradient checkpointing is enabled
 
 # Prepare the dataset: read the json file
 # Split into train and test sets, saved as data_vl_train.json and data_vl_test.json
-train_json_path = "data_vl.json"
-with open(train_json_path, 'r') as f:
-    data = json.load(f)
-    train_data = data[:-4]
-    test_data = data[-4:]
+if True:  # toggle between the original data_vl.json pipeline and the JSONL pipeline below
+    train_json_path = "data_vl.json"
+    with open(train_json_path, 'r') as f:
+        data = json.load(f)
+        train_data = data[:-4]
+        test_data = data[-4:]
 
-with open("data_vl_train.json", "w") as f:
-    json.dump(train_data, f)
+    with open("data_vl_train.json", "w") as f:
+        json.dump(train_data, f)
 
-with open("data_vl_test.json", "w") as f:
-    json.dump(test_data, f)
+    with open("data_vl_test.json", "w") as f:
+        json.dump(test_data, f)
 
-train_ds = Dataset.from_json("data_vl_train.json")
-train_dataset = train_ds.map(process_func)  # type: ignore
+    train_ds = Dataset.from_json("data_vl_train.json")
+    train_dataset = train_ds.map(process_func)  # type: ignore
+else:
+    # Load the test and val datasets separately
+    test_data_path = 'data_test.jsonl'
+    val_data_path = 'data_val.jsonl'
+
+    test_dataset_dicts = load_and_convert_data(test_data_path)
+    val_dataset_dicts = load_and_convert_data(val_data_path)
+
+    # Create Dataset objects
+    test_tmp_dataset = Dataset.from_list(test_dataset_dicts)
+    val_tmp_dataset = Dataset.from_list(val_dataset_dicts)
+
+    test_tmp_dataset = test_tmp_dataset.select(list(range(1000)))
+    val_tmp_dataset = val_tmp_dataset.select(list(range(50)))
+
+    test_dataset = test_tmp_dataset.map(process_func, batched=True, batch_size=4)
+    val_dataset = val_tmp_dataset.map(process_func, batched=True, batch_size=4)
 
 # Configure LoRA
 config = LoraConfig(
diff --git a/train_grounding.py b/train_grounding.py
new file mode 100644
index 0000000..149ccd4
--- /dev/null
+++ b/train_grounding.py
@@ -0,0 +1,371 @@
+import os
+import sys
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
+import torch, gc
+gc.collect()
+torch.cuda.empty_cache()
+import deepspeed
+DS_CONFIG = "ds_zero2_no_offload.json"
+from datasets import Dataset
+from modelscope import snapshot_download, AutoTokenizer
+from swanlab.integration.transformers import SwanLabCallback
+from qwen_vl_utils import process_vision_info
+from peft import LoraConfig, TaskType, get_peft_model, PeftModel, get_peft_model_state_dict
+from transformers import (
+    TrainingArguments,  # type: ignore
+    Trainer,  # type: ignore
+    Qwen2_5_VLForConditionalGeneration,
+    AutoProcessor,
+)
+from transformers.data.data_collator import DataCollatorForSeq2Seq
+import swanlab
+import json
+
+# device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")  # with multiple GPUs, a specific device index could be pinned here
+# Check whether CUDA is available
+if torch.cuda.is_available():
+    # Read local_rank from the environment; DeepSpeed/PyTorch DDP sets this variable automatically.
+    # Default to 0 if the variable is absent (e.g. when running non-distributed).
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+
+    # Build the device object from local_rank
+    device = torch.device(f"cuda:{local_rank}")
+
+    torch.cuda.set_device(device)
+
+    print(f"Process with local_rank {local_rank} is using device: {device}")
+else:
+    device = torch.device("cpu")
+
+
+def load_and_convert_data(file_path):
+    """Load a JSONL file and convert it into Dataset-ready records."""
+    loaded_data = []
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            loaded_data.append(json.loads(line))
+
+    # Convert loaded_data into a format suitable for Dataset
+    dataset_dicts = []
+    for item in loaded_data:
+        user_content = item[0]['content']
+        assistant_content = item[1]['content']
+
+        # Extract the image and text entries
+        image_info = next((x for x in user_content if x['type'] == 'image'), None)
+        text_info = next((x for x in user_content if x['type'] == 'text'), None)
+
+        # Build the new record
+        dataset_entry = {
+            'role': 'user',
+            'image_path': image_info['image'] if image_info else None,
+            'question': text_info['text'] if text_info else None,
+            'assistant_answer': assistant_content
+        }
+
+        dataset_dicts.append(dataset_entry)
+
+    return dataset_dicts
+
+
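+# The function below packs each (question, answer, image) triple for causal-LM
+# training: every prompt token is labeled -100 so the loss covers only the
+# response plus one trailing pad token acting as EOS. With made-up ids:
+#     prompt_ids = [101, 102, 103]; answer_ids = [201, 202]; pad_id = 0
+#     input_ids  = prompt_ids + answer_ids + [pad_id]
+#     labels     = [-100, -100, -100] + answer_ids + [pad_id]
+# Positions labeled -100 are ignored by PyTorch's cross-entropy loss.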
+def process_func_batch(examples):
+    MAX_LENGTH = 2048
+    input_ids, attention_mask, labels, pixel_values, image_grid_thw = [], [], [], [], []
+
+    for example in zip(examples["question"], examples["assistant_answer"], examples["image_path"]):
+        input_content, output_content, file_path = example
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": f"{file_path}",
+                        "resized_height": 280,
+                        "resized_width": 280,
+                    },
+                    {"type": "text", "text": input_content},
+                ],
+            }
+        ]
+
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=False,  # no padding yet
+            return_tensors="pt",
+        )
+
+        inputs_dict = {key: value.tolist() for key, value in inputs.items()}
+        instruction_input_ids = inputs_dict['input_ids'][0]
+        instruction_attention_mask = inputs_dict['attention_mask'][0]
+
+        response = tokenizer(f"{output_content}", add_special_tokens=False)
+        response_input_ids = response['input_ids']
+        response_attention_mask = response['attention_mask']
+
+        # Compute the length still available for the response
+        remaining_length = MAX_LENGTH - len(instruction_input_ids) - 1  # reserve room for one PAD token
+
+        if remaining_length < 0:
+            # If the instruction alone already exceeds the max length, truncate the instruction
+            truncation_length = len(instruction_input_ids) + remaining_length
+            instruction_input_ids = instruction_input_ids[:truncation_length]
+            instruction_attention_mask = instruction_attention_mask[:truncation_length]
+            remaining_length = 0
+
+        # Truncate the response to fit the remaining space
+        current_input_ids = (
+            instruction_input_ids + response_input_ids[:remaining_length] + [tokenizer.pad_token_id]
+        )
+        current_attention_mask = (
+            instruction_attention_mask + response_attention_mask[:remaining_length] + [1]
+        )
+        current_labels = (
+            [-100] * len(instruction_input_ids)
+            + response_input_ids[:remaining_length]
+            + [tokenizer.pad_token_id]
+        )
+
+        # Pad everything out to MAX_LENGTH
+        if len(current_input_ids) < MAX_LENGTH:
+            current_input_ids += [tokenizer.pad_token_id] * (MAX_LENGTH - len(current_input_ids))
+            current_attention_mask += [0] * (MAX_LENGTH - len(current_attention_mask))
+            current_labels += [-100] * (MAX_LENGTH - len(current_labels))
+
+        input_ids.append(current_input_ids)
+        attention_mask.append(current_attention_mask)
+        labels.append(current_labels)
+        pixel_values.append(inputs_dict['pixel_values'])
+        image_grid_thw.append(torch.tensor(inputs_dict['image_grid_thw']).squeeze(0))
+
+    return {
+        "input_ids": torch.tensor(input_ids),
+        "attention_mask": torch.tensor(attention_mask),
+        "labels": torch.tensor(labels),
+        "pixel_values": torch.tensor(pixel_values),
+        "image_grid_thw": torch.stack(image_grid_thw)
+    }
+
+def predict(messages, model):
+    # Prepare for inference
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    device = next(model.parameters()).device
+    # Move every tensor to the model's device
+    for key, value in inputs.items():
+        inputs[key] = value.to(device)
+
+    # Generate the output
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    del inputs
+
+    return output_text[0]
+
+
+# Download the Qwen2-VL model from ModelScope into a local directory
+# model_dir = snapshot_download("Qwen/Qwen2-VL-2B-Instruct", cache_dir="./", revision="master")
+
+# Load the model weights with Transformers
+tokenizer = AutoTokenizer.from_pretrained("/home/gyk/models/Qwen2.5-VL-7B-Instruct/", use_fast=True)
+min_pixels = 256*28*28
+max_pixels = 1280*28*28
+processor = AutoProcessor.from_pretrained("/home/gyk/models/Qwen2.5-VL-7B-Instruct/", min_pixels=min_pixels, max_pixels=max_pixels, use_fast=True)
+device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained("/home/gyk/models/Qwen2.5-VL-7B-Instruct/", device_map=device_map, torch_dtype=torch.bfloat16)
+
+model.enable_input_require_grads()  # Required when gradient checkpointing is enabled
+
+# Prepare the dataset: read the json files
+# Load the test and val datasets separately
+test_data_path = 'data_test.jsonl'
+val_data_path = 'data_val.jsonl'
+
+test_dataset_dicts = load_and_convert_data(test_data_path)
+val_dataset_dicts = load_and_convert_data(val_data_path)
+
+# Create Dataset objects
+test_tmp_dataset = Dataset.from_list(test_dataset_dicts)
+val_tmp_dataset = Dataset.from_list(val_dataset_dicts)
+
+indices = list(range(1000))
+test_tmp_dataset = test_tmp_dataset.select(indices)
+indices = list(range(50))
+val_tmp_dataset = val_tmp_dataset.select(indices)
+
+test_dataset = test_tmp_dataset.map(process_func_batch, batched=True, batch_size=4)
+val_dataset = val_tmp_dataset.map(process_func_batch, batched=True, batch_size=4)
+
+print("Test and Val Datasets have been created.")
+
+# Configure LoRA
+config = LoraConfig(
+    task_type=TaskType.CAUSAL_LM,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    inference_mode=False,  # training mode
+    r=64,  # LoRA rank
+    lora_alpha=16,  # LoRA alpha; see the LoRA paper for its exact role
+    lora_dropout=0.05,  # dropout ratio
+    bias="none",
+)
+
+# Wrap the base model with the LoRA adapters
+peft_model = get_peft_model(model, config)
+peft_model.config.use_cache = False  # type: ignore
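+
+# Optional sanity check: only the LoRA adapter weights should be trainable
+# (with r=64 on all seven projection modules, a small fraction of the 7B
+# base parameters). The run itself is expected to be launched via the
+# DeepSpeed launcher, since TrainingArguments points at the ZeRO-2 config.
+peft_model.print_trainable_parameters()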
+
+# Configure the training arguments
+args = TrainingArguments(
+    output_dir="./output2/Qwen2.5-VL-7B",
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=8,
+    logging_steps=10,
+    logging_first_step=True,
+    num_train_epochs=4,
+    save_steps=50,
+    learning_rate=1e-4,
+    save_on_each_node=True,
+    gradient_checkpointing=True,
+    report_to="none",
+    # bf16=True,
+    fp16=True,
+    max_grad_norm=1.0,
+    deepspeed=DS_CONFIG
+)
+
+# Set up the SwanLab callback
+swanlab_callback = SwanLabCallback(
+    project="Qwen2.5-VL-finetune",
+    experiment_name="qwen2.5-vl-refcocog",
+    config={
+        "model": "https://modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct",
+        "dataset": "https://huggingface.co/datasets/Kangheng/refcocog",
+        "github": "https://github.com/datawhalechina/self-llm",
+        "prompt": "Please provide the bounding box for the following description: ",
+        "train_data_number": len(test_dataset),
+        "lora_rank": 64,
+        "lora_alpha": 16,
+        "lora_dropout": 0.05,
+    },
+)
+
+# Configure the Trainer
+trainer = Trainer(
+    model=peft_model,
+    args=args,
+    train_dataset=test_dataset,
+    eval_dataset=val_dataset,
+    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
+    callbacks=[swanlab_callback],
+)
+
+
+# Start training
+trainer.train()
+trainer.save_model('./output2/Qwen2.5-VL-7B')
+trainer.save_state()
+
+
+# ==================== Evaluation ====================
+# Configure the evaluation LoRA settings
+val_config = LoraConfig(
+    task_type=TaskType.CAUSAL_LM,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    inference_mode=True,  # inference mode
+    r=64,  # LoRA rank
+    lora_alpha=16,  # LoRA alpha; see the LoRA paper for its exact role
+    lora_dropout=0.05,  # dropout ratio
+    bias="none",
+)
+
+
+# Load the fine-tuned adapter for evaluation
+val_peft_model = PeftModel.from_pretrained(model, model_id="./output2/Qwen2.5-VL-7B/", config=val_config)
+
+
+# Collect everything that needs to be saved
+results_to_save = []
+
+# Also build test_image_list for SwanLab logging
+test_image_list = []
+
+
+for item in val_dataset:
+    if not isinstance(item, dict):
+        print("Failed to parse item")
+        sys.exit()
+    # Build the input message
+    messages = [{
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": item['image_path']
+            },
+            {
+                "type": "text",
+                "text": item['question']
+            }
+        ]}]
+
+    # Get the model's prediction
+    response = predict(messages, val_peft_model)
+    messages.append({"role": "assistant", "content": f"{response}"})
+
+    # Print or log the prediction
+    print(messages[-1])
+
+    # Append the prediction, the reference answer and the image path to the results
+    results_to_save.append({
+        'image_path': item['image_path'],
+        'question': item['question'],
+        'original_answer': item['assistant_answer'],
+        'predicted_answer': response,
+    })
+
+    # Also add it to test_image_list for SwanLab logging
+    test_image_list.append(swanlab.Image(item['image_path'], caption=response))
+
+# Path of the output file
+output_file_path = './predictions_results.json'
+
+# Write the results to a JSON file
+with open(output_file_path, 'w', encoding='utf-8') as file:
+    json.dump(results_to_save, file, ensure_ascii=False, indent=4)
+
+print(f"Results have been saved to {output_file_path}")
+swanlab.init()
+# Log the predictions with SwanLab
+swanlab.log({"Prediction": test_image_list})
+
+# When running in a Jupyter Notebook, call swanlab.finish() to stop SwanLab logging
+swanlab.finish()
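+
+# Optional follow-up: once the adapter checks out, it can be folded back into
+# the base weights with PEFT's merge API to produce a standalone checkpoint
+# that loads without PEFT (the output path below is illustrative):
+# merged_model = val_peft_model.merge_and_unload()
+# merged_model.save_pretrained("./output2/Qwen2.5-VL-7B-merged")
+# processor.save_pretrained("./output2/Qwen2.5-VL-7B-merged")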