NotoriousH2
/

gemma-3-1b-it-Math-GRPO

Model card Files Files and versions

gemma-3-1b-it-Math-GRPO / eval.py

NotoriousH2's picture

Add eval.py

013f3b2 verified about 1 month ago

history blame contribute delete

3.47 kB

	"""Common evaluation script: connects to a vLLM server and evaluates all 841 HRM8K problems (temperature=0)."""
	import os, json, re, sys, asyncio
	from langchain_openai import ChatOpenAI
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.output_parsers import StrOutputParser
	from openai import OpenAI

	# Korean instruction text placed ahead of each question: solve the math
	# problem step by step and print the final answer on the last line in
	# \boxed{integer} form (the prompt's own example is \boxed{42}).
	MATH_SYSTEM_PROMPT = """주어진 수학 문제를 단계별로 풀고 답변을 작성하세요.
	반드시 최종 답변을 \\boxed{정수} 형식으로 마지막 줄에 출력하세요.
	예시: \\boxed{42}"""

	def extract_boxed(text):
	    r"""Return the content of the last ``\boxed{...}`` in *text*, or None.

	    Braces are matched by nesting depth, so an answer such as
	    ``\boxed{\frac{1}{2}}`` is returned whole instead of being cut at the
	    first closing brace.

	    Args:
	        text: Model output to scan. ``None`` or an empty string yields None.

	    Returns:
	        The whitespace-stripped content of the last well-formed, non-empty
	        ``\boxed{...}`` group, or None when no such group exists.
	    """
	    if not text:
	        return None
	    opener = "\\boxed{"
	    found = None
	    pos = text.find(opener)
	    while pos != -1:
	        start = pos + len(opener)
	        depth = 1
	        end = start
	        # Walk forward until the brace that closes this \boxed{ is found.
	        while end < len(text):
	            ch = text[end]
	            if ch == "{":
	                depth += 1
	            elif ch == "}":
	                depth -= 1
	                if depth == 0:
	                    break
	            end += 1
	        if depth == 0:
	            # Empty groups (\boxed{}) are ignored, as with the former regex.
	            if end > start:
	                found = text[start:end]
	            resume = end + 1
	        else:
	            # Unbalanced group: skip this opener but keep looking inside it.
	            resume = start
	        pos = text.find(opener, resume)
	    return found.strip() if found is not None else None

	def normalize(a):
	    """Canonicalize an answer into a string suitable for equality comparison.

	    Commas and spaces are removed, then the value is parsed as a number when
	    possible: integral values become their integer string (``"3.0"`` ->
	    ``"3"``) and other finite values become ``str(float)`` (``"0.50"`` ->
	    ``"0.5"``).

	    Args:
	        a: Answer of any type; it is converted with ``str``. ``None`` is
	            passed through.

	    Returns:
	        None if *a* is None, otherwise the normalized string. Text that is
	        not a finite number is returned cleaned but otherwise unchanged.
	    """
	    if a is None:
	        return None
	    s = str(a).replace(",", "").replace(" ", "").strip()
	    try:
	        # Exact integer path: going through float would round integers
	        # above 2**53 and make distinct answers compare equal.
	        return str(int(s))
	    except ValueError:
	        pass
	    try:
	        n = float(s)
	        return str(int(n)) if n == int(n) else str(n)
	    except (ValueError, OverflowError):
	        # Non-numeric text, or inf/nan (int() rejects them): compare as text.
	        return s

	def check(pred, gt):
	    """Return True when *pred* and *gt* are equal after normalization.

	    Both values go through ``normalize``; a missing (None) value on either
	    side never counts as a match.
	    """
	    norm_pred = normalize(pred)
	    norm_gt = normalize(gt)
	    if norm_pred is None or norm_gt is None:
	        return False
	    return norm_pred == norm_gt

	async def evaluate(label="", save_path=None, *,
	                   base_url="http://localhost:8000/v1",
	                   api_key="token-abc123",
	                   data_path="data/HRM8k_eval.json"):
	    """Score the model served at *base_url* on the dataset in *data_path*.

	    The first model listed by the OpenAI-compatible endpoint is queried once
	    per dataset item (temperature 0, max_tokens 2048, up to 400 concurrent
	    requests). The last ``\\boxed{...}`` of each reply is compared with the
	    item's ``"answer"``, and results are tallied per ``"source"``.

	    Args:
	        label: Name printed in the result header and stored in the result.
	        save_path: If given, the summary and per-item details are written
	            there as UTF-8 JSON; parent directories are created as needed.
	        base_url: Base URL of the OpenAI-compatible server.
	        api_key: API key sent to that server.
	        data_path: JSON file holding a list of objects with ``"question"``
	            and ``"answer"`` keys and an optional ``"source"`` key.

	    Returns:
	        A dict with ``label``, ``correct``, ``total``, ``accuracy``
	        (percent; 0.0 for an empty dataset) and ``by_source`` (per-source
	        ``correct`` / ``total`` / ``no_boxed`` counts).
	    """
	    client = OpenAI(base_url=base_url, api_key=api_key)
	    model_name = client.models.list().data[0].id
	    print(f"모델: {model_name}")

	    # The dataset contains Korean text; do not rely on the locale encoding.
	    with open(data_path, encoding="utf-8") as f:
	        data = json.load(f)
	    print(f"평가: {len(data)}개 (temperature=0, max_tokens=2048)")

	    llm = ChatOpenAI(base_url=base_url, api_key=api_key,
	                     model=model_name, temperature=0, max_tokens=2048)
	    prompt = ChatPromptTemplate([("user", "{sp}\n\n{q}")]).partial(sp=MATH_SYSTEM_PROMPT)
	    chain = prompt | llm | StrOutputParser()
	    inputs = [{"q": item["question"]} for item in data]
	    results = await chain.abatch(inputs, config={"max_concurrency": 400})

	    by_src = {}
	    details = []
	    for item, res in zip(data, results):
	        s = item.get("source", "?")
	        stats = by_src.setdefault(s, {"correct": 0, "total": 0, "no_boxed": 0})
	        stats["total"] += 1
	        answer = item["answer"]
	        pred = extract_boxed(res)
	        is_correct = pred is not None and check(pred, answer)
	        if pred is None:
	            stats["no_boxed"] += 1
	        elif is_correct:
	            stats["correct"] += 1
	        details.append({
	            "question": item["question"][:80],
	            "source": s,
	            # Long string answers are shortened to their last 30 characters.
	            "gt": answer[-30:] if isinstance(answer, str) else str(answer),
	            "pred": pred,
	            "correct": is_correct,
	        })

	    tc = sum(v["correct"] for v in by_src.values())
	    tt = sum(v["total"] for v in by_src.values())
	    # An empty dataset must not crash the summary with a division by zero.
	    accuracy = tc / tt * 100 if tt else 0.0
	    print(f"\n=== {label} 결과 (temperature=0) ===")
	    for s in sorted(by_src):
	        v = by_src[s]
	        # v['total'] is at least 1 for every source that has an entry.
	        print(f" [{s.upper()}] {v['correct']}/{v['total']} ({v['correct']/v['total']*100:.1f}%) | boxed미출력: {v['no_boxed']}")
	    print(f" [전체] {tc}/{tt} ({accuracy:.1f}%)")

	    result_obj = {"label": label, "correct": tc, "total": tt, "accuracy": accuracy, "by_source": by_src}

	    if save_path:
	        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
	        # ensure_ascii=False emits raw Korean text, so write UTF-8 explicitly.
	        with open(save_path, "w", encoding="utf-8") as f:
	            json.dump({"result": result_obj, "details": details}, f, ensure_ascii=False, indent=2)
	        print(f" 결과 저장: {save_path}")

	    return result_obj

	if __name__ == "__main__":
	    # Usage: eval.py [label] [save_path]
	    # Start from the defaults, then override with whatever was supplied.
	    label, save_path = "eval", None
	    if len(sys.argv) > 1:
	        label = sys.argv[1]
	    if len(sys.argv) > 2:
	        save_path = sys.argv[2]
	    asyncio.run(evaluate(label, save_path))