UPO / preprocess /df.py
BryanW's picture
Add files using upload-large-folder tool
29787d6 verified
#!/usr/bin/env python3
import argparse
import sys
from pathlib import Path
# When this file is launched as `python /abs/path/preprocess/df.py`, its own
# directory is placed on sys.path, so sibling helper files (for example
# preprocess/copy.py) could shadow stdlib modules of the same name. Drop that
# directory from the import path before the third-party imports run.
SCRIPT_DIR = str(Path(__file__).resolve().parent)
try:
    # list.remove drops only the first matching entry.
    sys.path.remove(SCRIPT_DIR)
except ValueError:
    # The script directory was not on sys.path; nothing to do.
    pass
import numpy as np
import pandas as pd
def parse_args():
    """Parse the command-line options of the relative-score script.

    Options:
        --input_csv:  path of the CSV to read (default
            ``train_processed_v2.csv``).
        --output_csv: path of the CSV to write (default
            ``train_processed_with_scores.csv``).
        --percentile: percentile used as the global threshold (default 50).

    Returns:
        argparse.Namespace: with attributes ``input_csv``, ``output_csv`` and
        ``percentile``.

    Exits:
        Via ``parser.error`` (usage message on stderr, exit status 2) when
        ``--percentile`` is not a number in the closed range [0, 100].
    """
    parser = argparse.ArgumentParser(
        description="Recompute relative_score from scalar_score/normalized_score with a configurable percentile."
    )
    parser.add_argument(
        "--input_csv",
        default="train_processed_v2.csv",
        help="Input CSV. Can be either the raw processed CSV or an existing *_with_scores.csv file.",
    )
    parser.add_argument(
        "--output_csv",
        default="train_processed_with_scores.csv",
        help="Output CSV path.",
    )
    parser.add_argument(
        "--percentile",
        type=float,
        default=50,
        help="Percentile used as the global threshold for relative_score.",
    )
    args = parser.parse_args()

    # Percentiles are only defined on [0, 100]. Reject anything else here so
    # the user gets a usage error immediately instead of a traceback from the
    # percentile computation after the input has been loaded. Written as a
    # negated chained comparison so that NaN is rejected as well.
    if not 0.0 <= args.percentile <= 100.0:
        parser.error(
            f"--percentile must be between 0 and 100, got {args.percentile:g}"
        )
    return args
def main():
    """Recompute ``relative_score`` for a scored CSV and write the result.

    Steps:
        1. Read ``--input_csv`` and map the legacy column names
           (``file_path``, ``caption``) to ``image_path`` / ``prompt``.
        2. If ``normalized_score`` is absent, min-max scale ``scalar_score``
           onto [-5, 5] (all zeros when every score is identical).
        3. Take the ``--percentile`` of ``normalized_score`` as a global
           threshold and store ``normalized_score - threshold`` as
           ``relative_score``.
        4. Add ``kto_label`` (``exclusive_win`` when ``relative_score > 0``,
           else ``exclusive_lose``) if the column is not already present.
        5. Write the six output columns to ``--output_csv``, creating the
           parent directory if needed.

    Raises:
        ValueError: if ``image_path``, ``prompt`` or ``scalar_score`` is
            missing after the legacy names have been mapped.
    """
    args = parse_args()
    input_csv = Path(args.input_csv)
    output_csv = Path(args.output_csv)

    df = pd.read_csv(input_csv)

    # Support both legacy column names and the newer standardized ones.
    # A legacy column is renamed only when its standardized name is not
    # already present; renaming unconditionally would leave two columns with
    # the same label (and duplicate them in the output). An un-renamed legacy
    # column is simply dropped by the column selection further down.
    legacy_names = {
        "file_path": "image_path",
        "caption": "prompt",
    }
    df = df.rename(
        columns={
            old: new
            for old, new in legacy_names.items()
            if new not in df.columns
        }
    )

    required_columns = {"image_path", "prompt", "scalar_score"}
    missing = required_columns - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns in {input_csv}: {sorted(missing)}")

    # Recompute normalized_score if it is absent so the script works from either source CSV.
    if "normalized_score" not in df.columns:
        min_score = df["scalar_score"].min()
        max_score = df["scalar_score"].max()
        if min_score == max_score:
            # Degenerate range: avoid dividing by zero, centre everything.
            df["normalized_score"] = 0.0
        else:
            df["normalized_score"] = -5 + (df["scalar_score"] - min_score) * (10.0 / (max_score - min_score))

    # np.percentile returns NaN as soon as one input is NaN, which would turn
    # every relative_score into NaN. Ignore missing scores when computing the
    # threshold and tell the user about them (on stderr, so stdout keeps its
    # original content).
    missing_scores = int(df["normalized_score"].isna().sum())
    if missing_scores:
        print(
            f"Warning: {missing_scores} rows have no normalized_score; "
            "they are ignored when computing the threshold.",
            file=sys.stderr,
        )
    threshold = np.nanpercentile(df["normalized_score"].values, args.percentile)
    print(f"Global score threshold (percentile {args.percentile:g}): {threshold:.6f}")

    df["relative_score"] = df["normalized_score"] - threshold

    # NOTE(review): an existing kto_label column is kept as-is, so it may not
    # agree with a threshold recomputed at a different percentile — confirm
    # this is intended.
    if "kto_label" not in df.columns:
        # Rows whose relative_score is NaN compare False and get "exclusive_lose".
        df["kto_label"] = np.where(df["relative_score"] > 0, "exclusive_win", "exclusive_lose")

    # Keep only the standardized columns, in a fixed order.
    new_columns = [
        "image_path",
        "prompt",
        "scalar_score",
        "normalized_score",
        "relative_score",
        "kto_label",
    ]
    df = df[new_columns]

    pos_count = int((df["relative_score"] > 0).sum())
    non_pos_count = int((df["relative_score"] <= 0).sum())
    print(f"Positive samples (relative_score > 0): {pos_count}")
    print(f"Non-positive samples: {non_pos_count}")

    output_csv.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv} with {len(df)} rows.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()