| |
| import argparse |
| import sys |
| from pathlib import Path |
|
|
| |
| |
# Absolute, symlink-resolved directory that contains this script.
SCRIPT_DIR = str(Path(__file__).resolve().parent)

# Drop the script directory from the import search path (first occurrence
# only, matching list.remove semantics).
# NOTE(review): presumably this keeps files sitting next to the script from
# shadowing the third-party imports that follow - confirm the intent.
try:
    sys.path.remove(SCRIPT_DIR)
except ValueError:
    # The directory was not on sys.path; nothing to do.
    pass
|
|
| import numpy as np |
| import pandas as pd |
|
|
|
|
def _percentile_arg(text):
    """Convert a ``--percentile`` value to float, rejecting values outside [0, 100]."""
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
    # The chained comparison is False for NaN as well, so NaN is rejected here
    # instead of surfacing later as an error from the percentile computation.
    if not 0.0 <= value <= 100.0:
        raise argparse.ArgumentTypeError(
            f"percentile must be within [0, 100], got {text!r}"
        )
    return value


def parse_args():
    """Parse the command-line options of the score recomputation script.

    Returns:
        argparse.Namespace with:
            input_csv: path of the CSV to read (default "train_processed_v2.csv").
            output_csv: path of the CSV to write
                (default "train_processed_with_scores.csv").
            percentile: percentile in [0, 100] used as the global threshold
                for relative_score (default 50).

    An out-of-range or non-numeric ``--percentile`` is reported through the
    usual argparse error path (usage message, exit status 2) before any data
    is read.
    """
    parser = argparse.ArgumentParser(
        description="Recompute relative_score from scalar_score/normalized_score with a configurable percentile."
    )
    parser.add_argument(
        "--input_csv",
        default="train_processed_v2.csv",
        help="Input CSV. Can be either the raw processed CSV or an existing *_with_scores.csv file.",
    )
    parser.add_argument(
        "--output_csv",
        default="train_processed_with_scores.csv",
        help="Output CSV path.",
    )
    parser.add_argument(
        "--percentile",
        type=_percentile_arg,
        default=50,
        help="Percentile used as the global threshold for relative_score.",
    )
    return parser.parse_args()
|
|
|
|
def main():
    """Read the input CSV, recompute relative scores, and write the result.

    Steps:
      1. Load ``--input_csv`` and rename ``file_path``/``caption`` to
         ``image_path``/``prompt`` when those raw column names are present.
      2. If ``normalized_score`` is absent, min-max rescale ``scalar_score``
         onto [-5, 5] (a constant column maps to 0.0).
      3. Take the ``--percentile`` percentile of ``normalized_score`` as the
         global threshold and set ``relative_score = normalized_score - threshold``.
      4. If ``kto_label`` is absent, label rows with ``relative_score > 0`` as
         "exclusive_win" and all others as "exclusive_lose".
      5. Write the six output columns to ``--output_csv``, creating parent
         directories as needed.

    Raises:
        ValueError: if a required column is missing, or if the input holds no
            non-NaN scores to derive a threshold from.
    """
    args = parse_args()

    input_csv = Path(args.input_csv)
    output_csv = Path(args.output_csv)

    df = pd.read_csv(input_csv)

    # Accept the raw column names as well; columns already using the target
    # names are left untouched by rename().
    df = df.rename(
        columns={
            "file_path": "image_path",
            "caption": "prompt",
        }
    )

    required_columns = {"image_path", "prompt", "scalar_score"}
    missing = required_columns - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns in {input_csv}: {sorted(missing)}")

    # Only derive normalized_score when the input does not already carry it.
    if "normalized_score" not in df.columns:
        min_score = df["scalar_score"].min()
        max_score = df["scalar_score"].max()

        if min_score == max_score:
            # Degenerate range: avoid dividing by zero.
            df["normalized_score"] = 0.0
        else:
            df["normalized_score"] = -5 + (df["scalar_score"] - min_score) * (10.0 / (max_score - min_score))

    # A plain percentile becomes NaN as soon as one score is NaN, which would
    # silently turn every relative_score into NaN. Compute the threshold over
    # the valid scores only, and fail loudly when there are none.
    valid_mask = df["normalized_score"].notna()
    if not valid_mask.any():
        raise ValueError(
            f"No non-NaN scores found in {input_csv}; cannot compute a percentile threshold."
        )
    nan_count = int((~valid_mask).sum())
    if nan_count:
        print(f"Warning: ignoring {nan_count} rows with missing scores when computing the threshold.")

    threshold = np.nanpercentile(df["normalized_score"].values, args.percentile)
    print(f"Global score threshold (percentile {args.percentile:g}): {threshold:.6f}")

    df["relative_score"] = df["normalized_score"] - threshold

    # NOTE(review): an existing kto_label column is kept as-is, so it may not
    # match the relative_score recomputed above - confirm this is intended.
    if "kto_label" not in df.columns:
        df["kto_label"] = np.where(df["relative_score"] > 0, "exclusive_win", "exclusive_lose")

    # Keep only the output columns, in a fixed order.
    new_columns = [
        "image_path",
        "prompt",
        "scalar_score",
        "normalized_score",
        "relative_score",
        "kto_label",
    ]
    df = df[new_columns]

    pos_count = int((df["relative_score"] > 0).sum())
    non_pos_count = int((df["relative_score"] <= 0).sum())
    print(f"Positive samples (relative_score > 0): {pos_count}")
    print(f"Non-positive samples: {non_pos_count}")

    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv} with {len(df)} rows.")
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|