78 lines
2.7 KiB
Python
78 lines
2.7 KiB
Python
import os
|
||
import sys
|
||
from PIL import Image
|
||
import json
|
||
from tqdm import tqdm
|
||
|
||
|
||
# Set the HF_ENDPOINT environment variable to the hf-mirror.com URL.
# NOTE(review): this assignment sits before the `datasets` import below,
# presumably so the Hugging Face libraries see the mirror endpoint when they
# are first loaded — confirm before reordering the imports.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
||
|
||
from datasets import load_dataset, DatasetDict
|
||
|
||
# Dataset splits to export; each split is written to its own JSONL file.
SPLITS = ("test", "val")
# Directory that receives the exported JPEG images.
IMAGE_DIR = "dataset"
# Image modes Pillow's JPEG writer accepts as-is. Anything else (for example
# RGBA or palette images) is converted to RGB before saving.
JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _save_as_jpeg(image, path: str) -> None:
    """Save ``image`` to ``path`` in JPEG format.

    ``image`` is expected to be a PIL image; this helper relies on its
    ``mode``, ``convert`` and ``save`` members.  Images in a mode the JPEG
    writer cannot encode are converted to RGB first, so that a single
    RGBA/palette sample does not abort the whole export with ``OSError``.
    """
    if image.mode not in JPEG_MODES:
        image = image.convert("RGB")
    image.save(path, format="JPEG")


def _build_conversation(item: dict, image_ref: str) -> list:
    """Build the two-message (user, assistant) record for one dataset item.

    Args:
        item: A dataset row; its ``"question"`` and ``"bbox"`` fields are read.
        image_ref: Path string stored in the record to reference the image.

    Returns:
        A list with the user message (image + text prompt) followed by the
        assistant message containing the referred object and its box.
    """
    question = item["question"]
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image_ref,
            },
            {
                "type": "text",
                "text": f'Please provide the bounding box for the following description: {question}',
            },
        ],
    }
    # NOTE(review): the bbox is embedded verbatim using its Python repr;
    # confirm that this matches the coordinate format the consumer of these
    # files expects.
    assistant_message = {
        "role": "assistant",
        "content": f'<|object_ref_start|>{question}<|object_ref_end|> is located at <|box_start|>{item["bbox"]}<|box_end|>',
    }
    return [user_message, assistant_message]


# NOTE(review): "rafcocog_cache" looks like a misspelling of "refcocog"; it is
# kept unchanged so that an already-populated cache directory is still used.
ds = load_dataset("Kangheng/refcocog", cache_dir="./rafcocog_cache")

if not isinstance(ds, DatasetDict):
    # A string argument makes sys.exit() print the message to stderr and exit
    # with status 1; a bare sys.exit() would report success (status 0).
    sys.exit(f"Expected a DatasetDict, got {type(ds).__name__}.")

# Fail with a clear message instead of a KeyError if a split is absent.
missing_splits = [split for split in SPLITS if split not in ds]
if missing_splits:
    sys.exit(f"Dataset is missing split(s): {', '.join(missing_splits)}")

# Total number of items across the exported splits, used to size the progress bar.
total_items = sum(len(ds[data_key]) for data_key in SPLITS)  # type: ignore

# The image directory does not depend on the item, so create it once up front.
os.makedirs(IMAGE_DIR, exist_ok=True)

# One shared progress bar covers both splits.
with tqdm(total=total_items, desc="Processing items") as pbar:
    for data_key in SPLITS:
        # Each split is written to its own JSON Lines file.
        output_file = f'data_{data_key}.jsonl'
        # The file-name prefix keeps images of the two splits apart inside IMAGE_DIR.
        prefix = 'test-' if data_key == 'test' else 'val-'

        with open(output_file, 'w', encoding='utf-8') as f:
            for item in ds[data_key]:
                if not isinstance(item, dict):
                    # Message means "Failed to parse the dataset."; exit non-zero.
                    sys.exit("数据集解析失败.")

                # Images are always written with a .jpeg extension.
                image_name = f'{prefix}{item["question_id"]}.jpeg'
                _save_as_jpeg(item['image'], os.path.join(IMAGE_DIR, image_name))

                # The record references the image with a forward-slash path.
                record = _build_conversation(item, f'{IMAGE_DIR}/{image_name}')

                # One JSON document per line; keep non-ASCII text readable.
                f.write(json.dumps(record, ensure_ascii=False) + '\n')

                pbar.update(1)

print("Data has been saved to data_test.jsonl and data_val.jsonl")