This repository has been archived on 2025-11-14. You can view files and clone it, but cannot push or open issues or pull requests.
qwen-vl-finetune-bonus/createDataset.py

45 lines
1.4 KiB
Python

import json
from datasets import Dataset
def load_and_convert_data(file_path):
    """Load a JSONL conversation file and flatten it into dataset records.

    Each non-blank line is parsed as JSON and is indexed as a two-element
    conversation: element 0 is the user turn, whose ``content`` is a list of
    parts tagged by ``type`` (``'image'`` or ``'text'``), and element 1 is
    the assistant turn.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A list with one dict per conversation, holding the keys ``role``
        (always ``'user'``), ``image_path``, ``question`` and
        ``assistant_answer``. ``image_path`` / ``question`` are ``None`` when
        the user turn contains no image / text part.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset_dicts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing empty line at the end of the
            # file); json.loads would otherwise reject them.
            if not line.strip():
                continue
            item = json.loads(line)

            user_content = item[0]['content']
            assistant_content = item[1]['content']

            # Only the first image part and the first text part of the user
            # turn are kept; any further parts are ignored.
            image_info = next(
                (x for x in user_content if x['type'] == 'image'), None
            )
            text_info = next(
                (x for x in user_content if x['type'] == 'text'), None
            )

            dataset_dicts.append({
                'role': 'user',
                'image_path': image_info['image'] if image_info else None,
                'question': text_info['text'] if text_info else None,
                'assistant_answer': assistant_content,
            })
    return dataset_dicts
# Locations of the JSONL files for the test and validation splits.
test_data_path, val_data_path = 'data_test.jsonl', 'data_val.jsonl'

# Convert each split into a list of flat record dicts (test first, then val).
test_dataset_dicts, val_dataset_dicts = map(
    load_and_convert_data, (test_data_path, val_data_path)
)

# Wrap the record lists in Dataset objects.
test_dataset, val_dataset = map(
    Dataset.from_list, (test_dataset_dicts, val_dataset_dicts)
)

print("Test and Val Datasets have been created.")