#!/usr/bin/env python3
"""Recompute ``relative_score`` for a processed training CSV.

The script reads a CSV with ``image_path``/``prompt``/``scalar_score`` columns
(the legacy names ``file_path``/``caption`` are accepted too), derives
``normalized_score`` when it is absent, subtracts a global percentile
threshold to obtain ``relative_score`` and writes the result to a new CSV.
"""
import argparse
import sys
from pathlib import Path

# Avoid local helper files such as preprocess/copy.py shadowing stdlib modules
# when this script is executed as `python /abs/path/preprocess/df.py`.
# This must run before the third-party imports below.
SCRIPT_DIR = str(Path(__file__).resolve().parent)
if SCRIPT_DIR in sys.path:
    sys.path.remove(SCRIPT_DIR)

import numpy as np
import pandas as pd

# Legacy column name -> standardized column name.
_LEGACY_COLUMNS = {
    "file_path": "image_path",
    "caption": "prompt",
}
_REQUIRED_COLUMNS = {"image_path", "prompt", "scalar_score"}
# Columns written to the output CSV, in this order.
_OUTPUT_COLUMNS = [
    "image_path",
    "prompt",
    "scalar_score",
    "normalized_score",
    "relative_score",
    "kto_label",
]


def _percentile_arg(text):
    """Parse a ``--percentile`` value, accepting only numbers in [0, 100]."""
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
    # The chained comparison is also False for NaN, so NaN is rejected here.
    if not 0.0 <= value <= 100.0:
        raise argparse.ArgumentTypeError(
            f"percentile must be within [0, 100], got {text}"
        )
    return value


def parse_args(argv=None):
    """Parse command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with ``input_csv``, ``output_csv``
        and ``percentile`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Recompute relative_score from scalar_score/normalized_score with a configurable percentile."
    )
    parser.add_argument(
        "--input_csv",
        default="train_processed_v2.csv",
        help="Input CSV. Can be either the raw processed CSV or an existing *_with_scores.csv file.",
    )
    parser.add_argument(
        "--output_csv",
        default="train_processed_with_scores.csv",
        help="Output CSV path.",
    )
    parser.add_argument(
        "--percentile",
        type=_percentile_arg,
        default=50,
        help="Percentile used as the global threshold for relative_score.",
    )
    return parser.parse_args(argv)


def main(argv=None):
    """Run the relative-score recomputation end to end.

    Args:
        argv: Optional list of argument strings forwarded to ``parse_args``.
            ``None`` (the default) uses the process command line.

    Raises:
        ValueError: If the input CSV lacks ``image_path``, ``prompt`` or
            ``scalar_score`` (after legacy names have been mapped).

    Notes:
        Existing ``normalized_score`` and ``kto_label`` columns are kept
        as-is; only ``relative_score`` is always recomputed. A pre-existing
        ``kto_label`` is therefore not re-derived from the new threshold.
    """
    args = parse_args(argv)
    input_csv = Path(args.input_csv)
    output_csv = Path(args.output_csv)

    df = pd.read_csv(input_csv)

    # Support both legacy column names and the newer standardized ones.
    # A legacy column is renamed only when its standardized counterpart is
    # absent; renaming unconditionally would create duplicate column names
    # when a file carries both.
    renames = {
        old: new
        for old, new in _LEGACY_COLUMNS.items()
        if old in df.columns and new not in df.columns
    }
    df = df.rename(columns=renames)

    missing = _REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns in {input_csv}: {sorted(missing)}")

    # Recompute normalized_score if it is absent so the script works from either source CSV.
    if "normalized_score" not in df.columns:
        min_score = df["scalar_score"].min()
        max_score = df["scalar_score"].max()
        if min_score == max_score:
            # Degenerate range: every score is identical, so avoid dividing by zero.
            df["normalized_score"] = 0.0
        else:
            # Linearly map [min_score, max_score] onto [-5, 5].
            df["normalized_score"] = -5 + (df["scalar_score"] - min_score) * (
                10.0 / (max_score - min_score)
            )

    # np.percentile propagates NaN, so a single missing score would turn the
    # threshold (and every relative_score) into NaN. Use only the rows that
    # actually have a score, mirroring the NaN-skipping min()/max() above.
    scored = df["normalized_score"].dropna()
    if len(scored):
        threshold = np.percentile(scored.values, args.percentile)
    else:
        threshold = float("nan")
    print(f"Global score threshold (percentile {args.percentile:g}): {threshold:.6f}")

    df["relative_score"] = df["normalized_score"] - threshold
    if "kto_label" not in df.columns:
        # NaN > 0 is False, so rows without a score are labelled "exclusive_lose".
        df["kto_label"] = np.where(df["relative_score"] > 0, "exclusive_win", "exclusive_lose")

    df = df[_OUTPUT_COLUMNS]

    pos_count = int((df["relative_score"] > 0).sum())
    non_pos_count = int((df["relative_score"] <= 0).sum())
    print(f"Positive samples (relative_score > 0): {pos_count}")
    print(f"Non-positive samples: {non_pos_count}")

    # Rows with a NaN relative_score are in neither count above; surface them
    # so the totals are not silently inconsistent with the row count.
    unscored_count = int(df["relative_score"].isna().sum())
    if unscored_count:
        print(
            f"Warning: {unscored_count} rows have no usable score (relative_score is NaN).",
            file=sys.stderr,
        )

    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv} with {len(df)} rows.")


if __name__ == "__main__":
    main()