import json

from datasets import Dataset


def load_and_convert_data(file_path: str) -> list:
    """Load a JSONL conversation file and flatten it into dataset rows.

    Each non-blank line of the file is parsed as JSON. The code indexes every
    parsed record as ``record[0]['content']`` (the user turn, iterated as a
    sequence of dicts carrying a ``'type'`` key) and ``record[1]['content']``
    (the assistant turn, stored unchanged).

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A list with one dict per record, holding the keys ``'role'``,
        ``'image_path'``, ``'question'`` and ``'assistant_answer'``.
        ``'image_path'`` / ``'question'`` are ``None`` when the user turn has
        no part of type ``'image'`` / ``'text'``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError, IndexError, TypeError: If a record does not have the shape
            described above.
    """
    dataset_dicts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank or whitespace-only lines are not valid JSON and would make
            # json.loads raise; skip them instead of aborting the whole load.
            if not line.strip():
                continue
            item = json.loads(line)

            user_content = item[0]['content']
            assistant_content = item[1]['content']

            # Take the first image part and the first text part of the user turn.
            image_info = next(
                (x for x in user_content if x['type'] == 'image'), None
            )
            text_info = next(
                (x for x in user_content if x['type'] == 'text'), None
            )

            # Build the flat row that Dataset.from_list will consume.
            dataset_dicts.append({
                'role': 'user',
                'image_path': image_info['image'] if image_info else None,
                'question': text_info['text'] if text_info else None,
                'assistant_answer': assistant_content,
            })
    return dataset_dicts


# Load the test and validation splits separately.
test_data_path = 'data_test.jsonl'
val_data_path = 'data_val.jsonl'

test_dataset_dicts = load_and_convert_data(test_data_path)
val_dataset_dicts = load_and_convert_data(val_data_path)

# Create the Dataset objects.
test_dataset = Dataset.from_list(test_dataset_dicts)
val_dataset = Dataset.from_list(val_dataset_dicts)

print("Test and Val Datasets have been created.")