BryanW
/

UPO

Model card Files Files and versions

UPO / preprocess /df.py

BryanW's picture

Add files using upload-large-folder tool

29787d6 verified 4 days ago

history blame contribute delete

3.07 kB

	#!/usr/bin/env python3
	import argparse
	import sys
	from pathlib import Path

	# Drop this script's own directory from the import search path so that sibling
	# helper files (for example preprocess/copy.py) cannot shadow standard-library
	# modules when the script is launched as `python /abs/path/preprocess/df.py`.
	SCRIPT_DIR = str(Path(__file__).resolve().parent)
	try:
	    sys.path.remove(SCRIPT_DIR)
	except ValueError:
	    # The directory was not on sys.path, so there is nothing to remove.
	    pass

	import numpy as np
	import pandas as pd


	def _percentile_value(text):
	    """Convert a command-line string to a percentile in the closed range [0, 100].

	    Raises:
	        argparse.ArgumentTypeError: If the text is not a number or the number
	            lies outside [0, 100] (NaN included).
	    """
	    try:
	        value = float(text)
	    except ValueError:
	        raise argparse.ArgumentTypeError(f"invalid percentile value: {text!r}") from None
	    # The chained comparison is False for NaN as well, so NaN is rejected here.
	    if not 0.0 <= value <= 100.0:
	        raise argparse.ArgumentTypeError(
	            f"percentile must be between 0 and 100, got {text!r}"
	        )
	    return value


	def parse_args(argv=None):
	    """Parse the command-line options of this script.

	    Args:
	        argv: Optional list of argument strings. When None (the default),
	            argparse reads the arguments from sys.argv[1:].

	    Returns:
	        An argparse.Namespace with the attributes ``input_csv``, ``output_csv``
	        and ``percentile``.
	    """
	    parser = argparse.ArgumentParser(
	        description="Recompute relative_score from scalar_score/normalized_score with a configurable percentile."
	    )
	    parser.add_argument(
	        "--input_csv",
	        default="train_processed_v2.csv",
	        help="Input CSV. Can be either the raw processed CSV or an existing *_with_scores.csv file.",
	    )
	    parser.add_argument(
	        "--output_csv",
	        default="train_processed_with_scores.csv",
	        help="Output CSV path.",
	    )
	    parser.add_argument(
	        "--percentile",
	        # Validate at parse time so a bad value yields a usage error instead of
	        # a failure after the input CSV has already been loaded.
	        type=_percentile_value,
	        default=50.0,
	        help="Percentile used as the global threshold for relative_score.",
	    )
	    return parser.parse_args(argv)


	def main():
	    """Read the input CSV, recompute the score columns, and write the output CSV.

	    Steps:
	      1. Load the CSV named by ``--input_csv`` and map the legacy column names
	         (``file_path``, ``caption``) to ``image_path`` and ``prompt``.
	      2. If ``normalized_score`` is absent, derive it from ``scalar_score`` by
	         min-max scaling onto [-5, 5] (0.0 for every row when all scores are equal).
	      3. Use the ``--percentile`` percentile of ``normalized_score`` as a global
	         threshold and store ``normalized_score - threshold`` as ``relative_score``.
	      4. If ``kto_label`` is absent, label rows "exclusive_win" when
	         ``relative_score > 0`` and "exclusive_lose" otherwise.
	      5. Write the six output columns to ``--output_csv``, creating parent
	         directories as needed.

	    Raises:
	        ValueError: If a required column is missing, the input has no rows, or
	            no non-missing ``normalized_score`` value is available.
	    """
	    args = parse_args()

	    input_csv = Path(args.input_csv)
	    output_csv = Path(args.output_csv)

	    df = pd.read_csv(input_csv)

	    # Support both legacy column names and the newer standardized ones. A legacy
	    # column is renamed only when its standardized counterpart is absent;
	    # renaming unconditionally would produce two columns with the same name
	    # when the input already carries both.
	    legacy_to_standard = {"file_path": "image_path", "caption": "prompt"}
	    df = df.rename(
	        columns={
	            old: new
	            for old, new in legacy_to_standard.items()
	            if new not in df.columns
	        }
	    )

	    required_columns = {"image_path", "prompt", "scalar_score"}
	    missing = required_columns - set(df.columns)
	    if missing:
	        raise ValueError(f"Missing required columns in {input_csv}: {sorted(missing)}")

	    if df.empty:
	        raise ValueError(f"No rows found in {input_csv}; nothing to score.")

	    # Recompute normalized_score if it is absent so the script works from either source CSV.
	    if "normalized_score" not in df.columns:
	        min_score = df["scalar_score"].min()
	        max_score = df["scalar_score"].max()

	        if min_score == max_score:
	            # All scores are identical: avoid dividing by a zero range.
	            df["normalized_score"] = 0.0
	        else:
	            # Linear map of [min_score, max_score] onto [-5, 5].
	            df["normalized_score"] = -5 + (df["scalar_score"] - min_score) * (10.0 / (max_score - min_score))

	    # np.percentile returns NaN as soon as one input is NaN, which would turn
	    # every relative_score into NaN. Compute the threshold from the non-missing
	    # scores only.
	    valid_scores = df["normalized_score"].dropna().to_numpy()
	    if valid_scores.size == 0:
	        raise ValueError(
	            f"No usable normalized_score values in {input_csv}; cannot compute a threshold."
	        )
	    skipped = len(df) - valid_scores.size
	    if skipped:
	        print(f"Warning: ignoring {skipped} rows with missing normalized_score when computing the threshold.")

	    threshold = np.percentile(valid_scores, args.percentile)
	    print(f"Global score threshold (percentile {args.percentile:g}): {threshold:.6f}")

	    df["relative_score"] = df["normalized_score"] - threshold

	    # An existing kto_label column is kept as-is, so it may not reflect the
	    # threshold computed above. NOTE(review): confirm this is intended.
	    # Rows whose relative_score is NaN fail the `> 0` test and are labelled
	    # "exclusive_lose".
	    if "kto_label" not in df.columns:
	        df["kto_label"] = np.where(df["relative_score"] > 0, "exclusive_win", "exclusive_lose")

	    # Keep only the output columns, in a fixed order.
	    new_columns = [
	        "image_path",
	        "prompt",
	        "scalar_score",
	        "normalized_score",
	        "relative_score",
	        "kto_label",
	    ]
	    df = df[new_columns]

	    pos_count = int((df["relative_score"] > 0).sum())
	    non_pos_count = int((df["relative_score"] <= 0).sum())
	    print(f"Positive samples (relative_score > 0): {pos_count}")
	    print(f"Non-positive samples: {non_pos_count}")

	    output_csv.parent.mkdir(parents=True, exist_ok=True)
	    df.to_csv(output_csv, index=False)
	    print(f"Saved to {output_csv} with {len(df)} rows.")


	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()