| |
| """ |
| Convert math_vision data_all.json to mm_math format and split into train/test sets |
| """ |
| import json |
| import random |
| from pathlib import Path |
|
|
def convert_to_mm_math_format(item):
    """
    Convert one math_vision record into an mm_math-style record.

    Args:
        item: dict in math_vision format. Only the keys ``answer``,
            ``question``, ``options`` and ``image`` are read; each one is
            optional and falls back to an empty value when missing.

    Returns:
        dict with four string fields:
            solution:   the answer wrapped as ``\\boxed{<answer>}``
            prompt:     fixed instruction, then the question, then the
                        comma-separated options when there are any
            completion: the same text as ``solution``
            image_path: ``dataset/math_vision/<image>``, or "" when the
                        record has no image

    Math vision format (input):
        {
            "id": "1",
            "question": "Which number should be written in place of the question mark?\\n<image1>",
            "options": [],
            "image": "images/1.jpg",
            "answer": "60",
            "solution": null,
            "level": 2,
            "subject": "arithmetic"
        }

    MM Math format (output):
        {
            "solution": "\\boxed{60}",
            "prompt": "Solve the problem and output the answer in the format of \\boxed{your answer}.\\n Question: ... \\n Options: ...",
            "completion": "\\boxed{60}",
            "image_path": "dataset/math_vision/images/1.jpg"
        }
    """
    answer = item.get('answer', '')
    question = item.get('question', '')
    options = item.get('options', [])
    image = item.get('image', '')

    # "solution" and "completion" carry the same boxed answer, so build it once.
    boxed_answer = f"\\boxed{{{answer}}}"

    # NOTE(review): "\\n" below is a literal backslash followed by "n", not a
    # newline character. It is kept unchanged; presumably it matches the
    # existing mm_math prompt text -- confirm before altering it.
    prompt = f"Solve the problem and output the answer in the format of \\boxed{{your answer}}.\\n Question: {question}"

    if options:
        # Coerce every option to str so numeric choices do not make
        # str.join raise TypeError.
        options_str = ", ".join(str(option) for option in options)
        prompt += f"\\n Options: {options_str}"

    image_path = f"dataset/math_vision/{image}" if image else ""

    return {
        "solution": boxed_answer,
        "prompt": prompt,
        "completion": boxed_answer,
        "image_path": image_path,
    }
|
|
def main(input_file=None, output_dir=None, split_ratio=0.8, seed=42):
    """
    Shuffle the math_vision dataset, split it, convert it, and save it.

    Reads a JSON list of math_vision records, shuffles it with a fixed
    seed, splits it into train/test parts, converts every record with
    ``convert_to_mm_math_format`` and writes ``train.json`` and
    ``test.json`` into the output directory.

    Args:
        input_file: path of the source JSON file. Defaults to
            ``/root/CVPR/MemGen/data/math_vision/data_all.json``.
        output_dir: directory that receives ``train.json`` and
            ``test.json``. Defaults to the directory of ``input_file``.
        split_ratio: fraction of the samples placed in the train set;
            must lie within [0, 1].
        seed: value passed to ``random.seed`` so the shuffle (and hence
            the split) is reproducible. This reseeds the module-level
            random generator.

    Raises:
        ValueError: if ``split_ratio`` is outside [0, 1].
        TypeError: if the JSON document is not a list.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio}")

    random.seed(seed)

    if input_file is None:
        input_file = "/root/CVPR/MemGen/data/math_vision/data_all.json"
    input_file = Path(input_file)
    print(f"Reading {input_file}...")

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        # random.shuffle and slicing below need a list of records.
        raise TypeError(
            f"Expected a JSON list of samples in {input_file}, "
            f"got {type(data).__name__}"
        )

    print(f"Total samples: {len(data)}")

    random.shuffle(data)

    # int() truncates, so the train part rounds down and the test part
    # receives the remainder.
    split_index = int(len(data) * split_ratio)
    train_data = data[:split_index]
    test_data = data[split_index:]

    print(f"Train samples: {len(train_data)}")
    print(f"Test samples: {len(test_data)}")

    print("\nConverting train data to mm_math format...")
    train_converted = [convert_to_mm_math_format(item) for item in train_data]

    print("Converting test data to mm_math format...")
    test_converted = [convert_to_mm_math_format(item) for item in test_data]

    output_dir = input_file.parent if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_output = output_dir / "train.json"
    test_output = output_dir / "test.json"

    print(f"\nSaving train data to {train_output}...")
    with open(train_output, 'w', encoding='utf-8') as f:
        json.dump(train_converted, f, ensure_ascii=False, indent=2)

    print(f"Saving test data to {test_output}...")
    with open(test_output, 'w', encoding='utf-8') as f:
        json.dump(test_converted, f, ensure_ascii=False, indent=2)

    print("\nConversion complete!")
    print(f"Train set: {len(train_converted)} samples -> {train_output}")
    print(f"Test set: {len(test_converted)} samples -> {test_output}")

    # The train split is empty for inputs with 0 or 1 samples (or a ratio
    # of 0); skip the preview instead of raising IndexError after the
    # files have already been written.
    if train_converted:
        print("\n" + "="*80)
        print("Sample converted train data:")
        print("="*80)
        print(json.dumps(train_converted[0], ensure_ascii=False, indent=2))
|
|
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|