LCZZZZ
/

model111

Model card Files Files and versions

model111 / scripts /convert_math_vision_format.py

LCZZZZ's picture

Upload MemGen code and data

e34b94f verified 6 months ago

history blame contribute delete

3.93 kB

	#!/usr/bin/env python3
	"""
	Convert math_vision data_all.json to mm_math format and split into train/test sets
	"""
	import json
	import random
	from pathlib import Path

	def convert_to_mm_math_format(item):
	    r"""
	    Convert a single math_vision record into an mm_math record.

	    The examples below are shown as JSON text (escapes left as written).

	    Math vision format (input):
	    {
	        "id": "1",
	        "question": "Which number should be written in place of the question mark?\n<image1>",
	        "options": [],
	        "image": "images/1.jpg",
	        "answer": "60",
	        "solution": null,
	        "level": 2,
	        "subject": "arithmetic"
	    }

	    MM Math format (output):
	    {
	        "solution": "\\boxed{28^\\circ}",
	        "prompt": "Solve the problem and output the answer in the format of \\boxed{your answer}.\\n Question: ... \\n Options: ...",
	        "completion": "answer",
	        "image_path": "dataset/mm_math/images/MM_Math/52076087.png"
	    }

	    Args:
	        item: Mapping holding one math_vision record. Only the keys
	            "answer", "question", "options" and "image" are read; a key
	            that is missing or null is treated as empty.

	    Returns:
	        A dict with the keys "solution", "prompt", "completion" and
	        "image_path", in that order. "solution" and "completion" both hold
	        the answer wrapped in \boxed{...}.
	    """
	    # A JSON null is handled like a missing field so that it never ends up
	    # rendered as the text "None" inside the boxed answer or the prompt.
	    answer = item.get('answer')
	    if answer is None:
	        answer = ''
	    question = item.get('question')
	    if question is None:
	        question = ''

	    # Solution and completion share the same boxed rendering of the answer.
	    boxed_answer = f"\\boxed{{{answer}}}"

	    # NOTE: "\\n" is deliberately the two characters backslash + "n" (not a
	    # newline), mirroring the separator shown in the mm_math example above.
	    prompt = (
	        "Solve the problem and output the answer in the format of "
	        f"\\boxed{{your answer}}.\\n Question: {question}"
	    )

	    # Append the options only when there are any; str() keeps the join from
	    # raising TypeError if an option was stored as a number.
	    options = item.get('options')
	    if options:
	        prompt += "\\n Options: " + ", ".join(str(option) for option in options)

	    # "images/1.jpg" -> "dataset/math_vision/images/1.jpg"; empty when the
	    # record has no image.
	    image = item.get('image')
	    image_path = f"dataset/math_vision/{image}" if image else ""

	    return {
	        "solution": boxed_answer,
	        "prompt": prompt,
	        "completion": boxed_answer,
	        "image_path": image_path,
	    }

	def main(input_file=None, output_dir=None, split_ratio=0.8, seed=42):
	    """
	    Shuffle a math_vision dataset, split it, and write mm_math-format files.

	    Loads the JSON list at ``input_file``, shuffles it with a seeded
	    generator, splits it into train/test parts, converts every record with
	    ``convert_to_mm_math_format`` and writes ``train.json`` and
	    ``test.json`` into ``output_dir``. Progress is reported with ``print``.

	    Args:
	        input_file: Path of the source JSON file. Defaults to
	            /root/CVPR/MemGen/data/math_vision/data_all.json.
	        output_dir: Directory receiving train.json and test.json. Defaults
	            to the directory containing ``input_file``; created if missing.
	        split_ratio: Fraction of the samples placed in the train set
	            (default 0.8); the rest goes to the test set.
	        seed: Seed for the shuffle, for reproducible splits (default 42).

	    Raises:
	        ValueError: If ``split_ratio`` is not within [0, 1].
	    """
	    if not 0 <= split_ratio <= 1:
	        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

	    if input_file is None:
	        input_file = "/root/CVPR/MemGen/data/math_vision/data_all.json"
	    input_file = Path(input_file)
	    output_dir = input_file.parent if output_dir is None else Path(output_dir)

	    # Read data_all.json
	    print(f"Reading {input_file}...")
	    with open(input_file, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    print(f"Total samples: {len(data)}")

	    # A private generator gives the same order as seeding the module-level
	    # one with the same seed, without touching the global random state.
	    rng = random.Random(seed)
	    rng.shuffle(data)

	    # Split into train and test according to split_ratio
	    split_index = int(len(data) * split_ratio)
	    train_data = data[:split_index]
	    test_data = data[split_index:]

	    print(f"Train samples: {len(train_data)}")
	    print(f"Test samples: {len(test_data)}")

	    # Convert to mm_math format
	    print("\nConverting train data to mm_math format...")
	    train_converted = [convert_to_mm_math_format(item) for item in train_data]

	    print("Converting test data to mm_math format...")
	    test_converted = [convert_to_mm_math_format(item) for item in test_data]

	    # Save the converted data
	    output_dir.mkdir(parents=True, exist_ok=True)
	    train_output = output_dir / "train.json"
	    test_output = output_dir / "test.json"

	    print(f"\nSaving train data to {train_output}...")
	    with open(train_output, 'w', encoding='utf-8') as f:
	        json.dump(train_converted, f, ensure_ascii=False, indent=2)

	    print(f"Saving test data to {test_output}...")
	    with open(test_output, 'w', encoding='utf-8') as f:
	        json.dump(test_converted, f, ensure_ascii=False, indent=2)

	    print("\nConversion complete!")
	    print(f"Train set: {len(train_converted)} samples -> {train_output}")
	    print(f"Test set: {len(test_converted)} samples -> {test_output}")

	    # Show one converted sample; skipped when the train split is empty
	    # (tiny inputs), which would otherwise raise IndexError here.
	    if train_converted:
	        print("\n" + "=" * 80)
	        print("Sample converted train data:")
	        print("=" * 80)
	        print(json.dumps(train_converted[0], ensure_ascii=False, indent=2))

	# Run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
	    main()