import io
import json
import os

from datasets import Dataset, Features, Sequence, Value, Image
from PIL import Image as PILImage
# Load the QA annotation records. UTF-8 (the standard JSON encoding) is
# pinned explicitly so non-ASCII text is not decoded with whatever the
# platform's default locale encoding happens to be.
with open("Train_QA_10k_noFreeForm.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Build an in-memory dataset with one row per loaded record.
ds = Dataset.from_list(records)
def read_image_bytes(example):
    """Read the file referenced by a record and attach its raw bytes.

    Args:
        example: A single record (dict-like). ``example["path"]`` is the
            location of the file to read.

    Returns:
        The same ``example`` object, updated in place with an
        ``"image_bytes"`` entry holding the file's complete binary content.

    Raises:
        KeyError: If the record has no ``"path"`` entry.
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes are stored verbatim; nothing is decoded here.
    with open(example["path"], "rb") as img_f:
        example["image_bytes"] = img_f.read()
    return example
# Add an "image_bytes" column holding the raw content of each row's file.
# No columns are removed, so "path" is kept alongside the bytes.
ds = ds.map(read_image_bytes)

# Cast to an explicit schema so column types are fixed rather than inferred.
# The image payload is declared as plain binary at this stage.
features = Features({
    "problem_id": Value("int64"),
    "problem": Value("string"),
    "data_type": Value("string"),
    "problem_type": Value("string"),
    "options": Sequence(Value("string")),
    "solution": Value("string"),
    "data_source": Value("string"),
    "answer": Value("string"),
    "path": Value("string"),
    "image_bytes": Value("binary"),
})
ds = ds.cast(features)

# Rename the binary column to "images" and re-type it as an Image feature
# with decoding enabled.
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))
# Sanity check: fetch the first row's image and print it.
img0 = ds[0]["images"]
print(img0)

# Create the output directory up front: the write fails if ./hf_data does
# not exist yet (e.g. on a fresh checkout).
output_path = "./hf_data/Train_QA_10k_noFreeForm.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
ds.to_parquet(output_path)