This repository has been archived on 2025-11-14. You can view files and clone it, but cannot push or open issues or pull requests.
qwen-vl-finetune-bonus/data_vl_process_grounding.py

78 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
from PIL import Image
import json
from tqdm import tqdm
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
from datasets import load_dataset, DatasetDict
# Export the RefCOCOg 'test' and 'val' splits as chat-style grounding samples:
# every image is written to `save_dir` as a JPEG and every sample becomes one
# line of data_<split>.jsonl.

# NOTE(review): 'rafcocog_cache' looks like a typo for 'refcocog_cache'; it is
# left unchanged so that an already-populated cache directory keeps being used.
ds = load_dataset("Kangheng/refcocog", cache_dir="./rafcocog_cache")
if not isinstance(ds, DatasetDict):
    # The splits are selected by name below, which only works on a DatasetDict.
    # Passing a string to sys.exit() prints it to stderr and exits with
    # status 1, so callers can tell that the export did not happen.
    sys.exit("load_dataset did not return a DatasetDict; "
             "cannot select the 'test' and 'val' splits.")

# Splits to export; each one gets its own output file.
splits = ('test', 'val')

# Directory that receives the exported images. Created once up front instead
# of once per item.
save_dir = 'dataset'
os.makedirs(save_dir, exist_ok=True)

# Total number of samples across both splits, used to size the progress bar.
total_items = sum(len(ds[data_key]) for data_key in splits)

with tqdm(total=total_items, desc="Processing items") as pbar:
    for data_key in splits:
        output_file = f'data_{data_key}.jsonl'
        # File-name prefix that keeps images of different splits apart
        # ('test-' / 'val-').
        prefix = f'{data_key}-'
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in ds[data_key]:
                if not isinstance(item, dict):
                    # The message reads "Dataset parsing failed."; exit with
                    # a non-zero status so the failure is not reported as
                    # success.
                    sys.exit("数据集解析失败.")

                image_name = f'{prefix}{item["question_id"]}.jpeg'
                save_path = os.path.join(save_dir, image_name)

                image = item['image']
                # JPEG cannot store an alpha channel or a palette; Pillow
                # raises OSError when asked to write these modes, so convert
                # them to RGB first. Other modes are saved untouched.
                if image.mode in ('RGBA', 'LA', 'P', 'PA'):
                    image = image.convert('RGB')
                image.save(save_path, format='JPEG')

                # One user turn (image + instruction) followed by the
                # assistant's grounded answer.
                # NOTE(review): item["bbox"] is interpolated with its default
                # str() form; confirm this matches the coordinate format the
                # fine-tuning pipeline expects.
                new_data_format = [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                # Always '/'-separated, independent of the OS
                                # path separator used for save_path.
                                "image": f'{save_dir}/{image_name}',
                            },
                            {
                                "type": "text",
                                "text": f'Please provide the bounding box for the following description: {item["question"]}',
                            },
                        ],
                    },
                    {
                        "role": "assistant",
                        "content": f'<|object_ref_start|>{item["question"]}<|object_ref_end|> is located at <|box_start|>{item["bbox"]}<|box_end|>',
                    },
                ]

                # JSON Lines: one sample per line; ensure_ascii=False keeps
                # non-ASCII text readable in the output file.
                f.write(json.dumps(new_data_format, ensure_ascii=False) + '\n')
                pbar.update(1)

print("Data has been saved to data_test.jsonl and data_val.jsonl")