45 lines
1.4 KiB
Python
45 lines
1.4 KiB
Python
|
|
import json
|
||
|
|
from datasets import Dataset
|
||
|
|
|
||
|
|
def load_and_convert_data(file_path):
    """Load a JSON Lines conversation file and flatten it into dataset rows.

    Each non-blank line is parsed as JSON and indexed as a sequence whose
    first element is the user turn and whose second element is the assistant
    turn. The user turn's ``content`` is iterated as a collection of parts,
    each tagged by a ``type`` key; the first ``image`` part and the first
    ``text`` part are extracted.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A list of dicts, one per record, with the keys ``role`` (always
        ``'user'``), ``image_path``, ``question`` and ``assistant_answer``.
        ``image_path`` and ``question`` are ``None`` when the user turn has
        no part of the corresponding type.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset_dicts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank or whitespace-only lines (e.g. a stray empty line at the
            # end of the file) carry no record; json.loads would raise on them.
            if not line.strip():
                continue

            item = json.loads(line)
            user_content = item[0]['content']
            assistant_content = item[1]['content']

            # Pick the first image part and the first text part, if present.
            image_info = next((x for x in user_content if x['type'] == 'image'), None)
            text_info = next((x for x in user_content if x['type'] == 'text'), None)

            dataset_dicts.append({
                'role': 'user',
                'image_path': image_info['image'] if image_info else None,
                'question': text_info['text'] if text_info else None,
                'assistant_answer': assistant_content,
            })

    return dataset_dicts
|
||
|
|
|
||
|
|
# Input JSON Lines files for the test and validation splits.
test_data_path, val_data_path = 'data_test.jsonl', 'data_val.jsonl'

# Test split: flatten the records, then wrap them in a Dataset.
test_dataset_dicts = load_and_convert_data(test_data_path)
test_dataset = Dataset.from_list(test_dataset_dicts)

# Validation split: same two steps.
val_dataset_dicts = load_and_convert_data(val_data_path)
val_dataset = Dataset.from_list(val_dataset_dicts)

print("Test and Val Datasets have been created.")
|