#!/usr/bin/env python3
import argparse
import sys
from pathlib import Path

# When this file is launched directly (e.g. `python /abs/path/preprocess/df.py`)
# the interpreter puts the script's own directory on sys.path. Sibling helper
# files such as preprocess/copy.py would then shadow stdlib modules of the same
# name, so drop that entry before the third-party imports below run.
SCRIPT_DIR = str(Path(__file__).resolve().parent)
try:
    sys.path.remove(SCRIPT_DIR)
except ValueError:
    # The directory was not on sys.path; nothing to clean up.
    pass

import numpy as np
import pandas as pd


def _percentile_arg(value):
    """Convert a command-line string to a percentile in the closed range [0, 100].

    Used as the argparse ``type=`` callable for ``--percentile`` so that bad
    values are reported as a usage error before any data is loaded.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a number or lies
            outside [0, 100] (NaN is rejected as well).
    """
    try:
        percentile = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid percentile value: {value!r}") from None
    # A chained comparison also rejects NaN, which fails every comparison.
    if not 0.0 <= percentile <= 100.0:
        raise argparse.ArgumentTypeError(f"percentile must be between 0 and 100, got {value!r}")
    return percentile


def parse_args(argv=None):
    """Parse the command-line options of the score recomputation script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``input_csv`` (str), ``output_csv``
        (str) and ``percentile`` (float in [0, 100]).
    """
    parser = argparse.ArgumentParser(
        description="Recompute relative_score from scalar_score/normalized_score with a configurable percentile."
    )
    parser.add_argument(
        "--input_csv",
        default="train_processed_v2.csv",
        help="Input CSV. Can be either the raw processed CSV or an existing *_with_scores.csv file.",
    )
    parser.add_argument(
        "--output_csv",
        default="train_processed_with_scores.csv",
        help="Output CSV path.",
    )
    parser.add_argument(
        "--percentile",
        type=_percentile_arg,
        # Float default so the attribute has the same type whether or not the
        # option is given (argparse does not run `type` on non-string defaults).
        default=50.0,
        help="Percentile used as the global threshold for relative_score.",
    )
    return parser.parse_args(argv)


def _standardize_columns(df):
    """Rename legacy column names to the standardized ones.

    A legacy column is renamed only when its standardized counterpart is not
    already present, so a CSV carrying both spellings does not end up with
    duplicate column names.
    """
    legacy_names = {
        "file_path": "image_path",
        "caption": "prompt",
    }
    renames = {
        old: new
        for old, new in legacy_names.items()
        if old in df.columns and new not in df.columns
    }
    return df.rename(columns=renames)


def _require_no_missing(scores, source):
    """Raise ValueError if the score column ``scores`` contains NaN values.

    np.percentile returns NaN as soon as one input value is NaN, which would
    turn every relative_score into NaN; fail loudly instead.
    """
    missing_count = int(scores.isna().sum())
    if missing_count:
        raise ValueError(
            f"Column {scores.name!r} in {source} has {missing_count} missing (NaN) value(s); "
            "cannot compute a percentile threshold."
        )


def _normalize_scores(scalar_scores):
    """Linearly rescale ``scalar_scores`` to [-5, 5]; all zeros if they are constant."""
    min_score = scalar_scores.min()
    max_score = scalar_scores.max()

    if min_score == max_score:
        # No spread to rescale; avoid dividing by zero.
        return pd.Series(0.0, index=scalar_scores.index)
    return -5 + (scalar_scores - min_score) * (10.0 / (max_score - min_score))


def main():
    """Read the input CSV, recompute relative_score, and write the output CSV.

    Steps: standardize column names, derive normalized_score from scalar_score
    when it is absent, subtract the requested percentile of normalized_score to
    get relative_score, add kto_label when absent, and save the six output
    columns.

    Raises:
        ValueError: If a required column is missing, the CSV has no rows, or
            the score column contains NaN values.
    """
    args = parse_args()

    input_csv = Path(args.input_csv)
    output_csv = Path(args.output_csv)

    # Support both legacy column names and the newer standardized ones.
    df = _standardize_columns(pd.read_csv(input_csv))

    required_columns = {"image_path", "prompt", "scalar_score"}
    missing = required_columns - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns in {input_csv}: {sorted(missing)}")
    if df.empty:
        raise ValueError(f"No rows found in {input_csv}; cannot compute a percentile threshold.")

    # Recompute normalized_score if it is absent so the script works from either source CSV.
    if "normalized_score" not in df.columns:
        _require_no_missing(df["scalar_score"], input_csv)
        df["normalized_score"] = _normalize_scores(df["scalar_score"])
    _require_no_missing(df["normalized_score"], input_csv)

    threshold = np.percentile(df["normalized_score"].values, args.percentile)
    print(f"Global score threshold (percentile {args.percentile:g}): {threshold:.6f}")

    df["relative_score"] = df["normalized_score"] - threshold

    # NOTE(review): an existing kto_label column is kept as-is, so when the
    # input is a previous *_with_scores.csv the labels may not match the
    # relative_score recomputed here. Confirm whether that is intended.
    if "kto_label" not in df.columns:
        df["kto_label"] = np.where(df["relative_score"] > 0, "exclusive_win", "exclusive_lose")

    # Keep only the output schema, in a fixed order.
    new_columns = [
        "image_path",
        "prompt",
        "scalar_score",
        "normalized_score",
        "relative_score",
        "kto_label",
    ]
    df = df[new_columns]

    pos_count = int((df["relative_score"] > 0).sum())
    non_pos_count = int((df["relative_score"] <= 0).sum())
    print(f"Positive samples (relative_score > 0): {pos_count}")
    print(f"Non-positive samples: {non_pos_count}")

    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv} with {len(df)} rows.")


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()