| | import os |
| | import math |
| | from pathlib import Path |
| | import sys |
| | from contextlib import contextmanager |
| | import numpy as np |
| | import pandas as pd |
| | import torch |
| | from tqdm import tqdm |
| | from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer |
| | from datasets import Dataset, DatasetDict, Features, Value, Sequence as HFSequence |
| | from transformers import AutoTokenizer, EsmModel, AutoModelForMaskedLM |
| | from lightning.pytorch import seed_everything |
seed_everything(1986)  # global seed for torch/numpy/python RNGs (Lightning helper)


# Input CSV with paired sequences, binding affinities, and OpenFold iPTM scores.
CSV_PATH = Path("./Classifier_Weight/training_data_cleaned/binding_affinity/c-binding_with_openfold_scores.csv")

# Root directory under which all generated datasets and reports are written.
OUT_ROOT = Path(
    "./Classifier_Weight/training_data_cleaned/binding_affinity"
)

# --- Wild-type (amino-acid sequence) embedding settings: ESM-2 650M ---
WT_MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
WT_MAX_LEN = 1022  # tokenizer truncation length for ESM inputs
WT_BATCH = 32

# --- SMILES embedding settings: PeptideCLM RoFormer + SPE tokenizer files ---
SMI_MODEL_NAME = "aaronfeller/PeptideCLM-23M-all"
TOKENIZER_VOCAB = "./Classifier_Weight/tokenizer/new_vocab.txt"
TOKENIZER_SPLITS = "./Classifier_Weight/tokenizer/new_splits.txt"
SMI_MAX_LEN = 768
SMI_BATCH = 128

# --- Train/val split configuration ---
TRAIN_FRAC = 0.80
RANDOM_SEED = 1986
AFFINITY_Q_BINS = 30  # quantile bins used for distribution-matched stratification

# Column names expected in the input CSV.
COL_SEQ1 = "seq1"
COL_SEQ2 = "seq2"
COL_AFF = "affinity"
COL_F2S = "Fasta2SMILES"
COL_REACT = "REACT_SMILES"
COL_WT_IPTM = "wt_iptm_score"
COL_SMI_IPTM = "smiles_iptm_score"

# Device used for all model inference (single GPU, CPU fallback).
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Logging controls: QUIET silences stdout, USE_TQDM toggles progress bars,
# LOG_FILE (if set to a path) receives appended log lines.
QUIET = True
USE_TQDM = False
LOG_FILE = None
| |
|
def log(msg: str):
    """Append *msg* to LOG_FILE (when configured) and echo to stdout unless QUIET."""
    if LOG_FILE is not None:
        target = Path(LOG_FILE)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(LOG_FILE, "a") as handle:
            handle.write(msg.rstrip() + "\n")
    if not QUIET:
        print(msg)
| |
|
def pbar(it, **kwargs):
    """Wrap *it* in a tqdm progress bar when USE_TQDM is on; otherwise pass it through."""
    if not USE_TQDM:
        return it
    return tqdm(it, **kwargs)
| |
|
@contextmanager
def section(title: str):
    """Context manager that logs a banner when a pipeline stage starts and finishes."""
    log(f"\n=== {title} ===")
    yield
    # NOTE: the closing banner is skipped if the body raises (no try/finally).
    log(f"=== done: {title} ===")
| |
|
| |
|
| | |
| | |
| | |
def has_uaa(seq: str) -> bool:
    """Return True when the sequence contains an 'X' marker (case-insensitive)."""
    return "x" in str(seq).lower()
| |
|
def affinity_to_class(a: float) -> str:
    """Bucket an affinity value into 'High' (>= 9), 'Moderate' (>= 7), or 'Low'."""
    for threshold, label in ((9.0, "High"), (7.0, "Moderate")):
        if a >= threshold:
            return label
    return "Low"
| |
|
def make_distribution_matched_split(df: pd.DataFrame) -> pd.DataFrame:
    """Assign a stratified train/val split so affinity distributions match across splits.

    Rows are bucketed by affinity quantiles (falling back to the coarse
    affinity_class labels when qcut cannot form bins); each bucket is shuffled
    and split TRAIN_FRAC / (1 - TRAIN_FRAC) independently.
    """
    out = df.copy()

    # Keep only rows with a parseable numeric affinity.
    out[COL_AFF] = pd.to_numeric(out[COL_AFF], errors="coerce")
    out = out.dropna(subset=[COL_AFF]).reset_index(drop=True)

    out["affinity_class"] = out[COL_AFF].apply(affinity_to_class)

    # Prefer fine-grained quantile bins; fall back to the 3-way class labels.
    try:
        out["aff_bin"] = pd.qcut(out[COL_AFF], q=AFFINITY_Q_BINS, duplicates="drop")
    except Exception:
        out["aff_bin"] = out["affinity_class"]
    strat_col = "aff_bin"

    rng = np.random.RandomState(RANDOM_SEED)

    out["split"] = None
    for _, bucket in out.groupby(strat_col, observed=True):
        indices = bucket.index.to_numpy()
        rng.shuffle(indices)
        cutoff = int(math.floor(len(indices) * TRAIN_FRAC))
        out.loc[indices[:cutoff], "split"] = "train"
        out.loc[indices[cutoff:], "split"] = "val"

    # Any row that somehow missed a bucket defaults to train.
    out["split"] = out["split"].fillna("train")
    return out
| |
|
| | def _summ(x): |
| | x = np.asarray(x, dtype=float) |
| | x = x[~np.isnan(x)] |
| | if len(x) == 0: |
| | return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan} |
| | return { |
| | "n": int(len(x)), |
| | "mean": float(np.mean(x)), |
| | "std": float(np.std(x)), |
| | "p50": float(np.quantile(x, 0.50)), |
| | "p95": float(np.quantile(x, 0.95)), |
| | } |
| |
|
| | def _len_stats(seqs): |
| | lens = np.asarray([len(str(s)) for s in seqs], dtype=float) |
| | if len(lens) == 0: |
| | return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan} |
| | return { |
| | "n": int(len(lens)), |
| | "mean": float(lens.mean()), |
| | "std": float(lens.std()), |
| | "p50": float(np.quantile(lens, 0.50)), |
| | "p95": float(np.quantile(lens, 0.95)), |
| | } |
| |
|
def verify_split_before_embedding(
    df2: pd.DataFrame,
    affinity_col: str,
    split_col: str,
    seq_col: str,
    iptm_col: str,
    aff_class_col: str = "affinity_class",
    aff_bins: int = 30,
    save_report_prefix: str | None = None,
    verbose: bool = False,
):
    """Sanity-check a train/val split before running the expensive embedding steps.

    Compares affinity and sequence-length statistics between splits, computes the
    maximum per-bin proportion gap, logs a one-line summary, and optionally writes
    `.stats.csv` / `.class_prop.csv` report files next to *save_report_prefix*.

    Raises AssertionError when the split column is missing, holds unexpected
    values, or no affinity value survives numeric coercion.
    """
    df2 = df2.copy()
    df2[affinity_col] = pd.to_numeric(df2[affinity_col], errors="coerce")
    df2[iptm_col] = pd.to_numeric(df2[iptm_col], errors="coerce")

    # Hard preconditions — fail loudly before any GPU time is spent.
    assert split_col in df2.columns, f"Missing split col: {split_col}"
    assert set(df2[split_col].dropna().unique()).issubset({"train", "val"}), f"Unexpected split values: {df2[split_col].unique()}"
    assert df2[affinity_col].notna().any(), "No valid affinity values after coercion."

    # Debug bins mirror the stratification used when the split was created.
    try:
        df2["_aff_bin_dbg"] = pd.qcut(df2[affinity_col], q=aff_bins, duplicates="drop")
    except Exception:
        df2["_aff_bin_dbg"] = df2[aff_class_col].astype(str)

    tr = df2[df2[split_col] == "train"].reset_index(drop=True)
    va = df2[df2[split_col] == "val"].reset_index(drop=True)

    tr_aff = _summ(tr[affinity_col].to_numpy())
    va_aff = _summ(va[affinity_col].to_numpy())
    tr_len = _len_stats(tr[seq_col].tolist())
    va_len = _len_stats(va[seq_col].tolist())

    # Per-split bin proportions; max absolute difference measures distribution drift.
    bin_ct = (
        df2.groupby([split_col, "_aff_bin_dbg"])
        .size()
        .groupby(level=0)
        .apply(lambda s: s / s.sum())
    )
    # NOTE(review): .loc["train"]/.loc["val"] assume both splits are non-empty — confirm upstream.
    tr_bins = bin_ct.loc["train"]
    va_bins = bin_ct.loc["val"]
    all_bins = tr_bins.index.union(va_bins.index)
    tr_bins = tr_bins.reindex(all_bins, fill_value=0.0)
    va_bins = va_bins.reindex(all_bins, fill_value=0.0)
    max_bin_diff = float(np.max(np.abs(tr_bins.values - va_bins.values)))

    msg = (
        f"[split-check] rows={len(df2)} train={len(tr)} val={len(va)} | "
        f"aff(mean±std) train={tr_aff['mean']:.3f}±{tr_aff['std']:.3f} val={va_aff['mean']:.3f}±{va_aff['std']:.3f} | "
        f"len(p50/p95) train={tr_len['p50']:.1f}/{tr_len['p95']:.1f} val={va_len['p50']:.1f}/{va_len['p95']:.1f} | "
        f"max_bin_diff={max_bin_diff:.4f}"
    )
    log(msg)

    # Optional console detail (only when both verbose and not QUIET).
    if verbose and (not QUIET):
        class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
        class_prop = class_ct.div(class_ct.sum(axis=1), axis=0)
        print("\n[verbose] affinity_class counts:\n", class_ct)
        print("\n[verbose] affinity_class proportions:\n", class_prop.round(4))

    # Optional on-disk report for later inspection.
    if save_report_prefix is not None:
        out = Path(save_report_prefix)
        out.parent.mkdir(parents=True, exist_ok=True)

        stats_df = pd.DataFrame([
            {"split": "train", **{f"aff_{k}": v for k, v in tr_aff.items()}, **{f"len_{k}": v for k, v in tr_len.items()}},
            {"split": "val", **{f"aff_{k}": v for k, v in va_aff.items()}, **{f"len_{k}": v for k, v in va_len.items()}},
        ])
        class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
        class_prop = class_ct.div(class_ct.sum(axis=1), axis=0).reset_index()

        stats_df.to_csv(out.with_suffix(".stats.csv"), index=False)
        class_prop.to_csv(out.with_suffix(".class_prop.csv"), index=False)
| |
|
| |
|
| | |
| | |
| | |
@torch.no_grad()
def wt_pooled_embeddings(seqs, tokenizer, model, batch_size=32, max_length=1022):
    """Mean-pool last-hidden-state embeddings for a list of protein sequences.

    Pooling is attention-mask weighted, so padding positions contribute nothing
    (special tokens covered by the attention mask ARE included in the mean).

    Args:
        seqs: list of sequence strings.
        tokenizer / model: HF tokenizer and encoder already on DEVICE.
        batch_size: sequences per forward pass.
        max_length: truncation length for tokenization.

    Returns:
        (len(seqs), hidden_size) numpy array of pooled embeddings.
    """
    # Fix: np.vstack([]) raises ValueError — return a well-shaped empty matrix
    # so callers can handle an empty split without crashing.
    if len(seqs) == 0:
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    embs = []
    for i in pbar(range(0, len(seqs), batch_size)):
        batch = seqs[i:i + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        out = model(**inputs)
        h = out.last_hidden_state  # (B, L, H)

        # Masked mean over the sequence dimension.
        attn = inputs["attention_mask"].unsqueeze(-1)
        summed = (h * attn).sum(dim=1)
        denom = attn.sum(dim=1).clamp(min=1e-9)  # guard against all-pad rows
        pooled = (summed / denom).detach().cpu().numpy()
        embs.append(pooled)

    return np.vstack(embs)
| |
|
| |
|
| | |
| | |
| | |
@torch.no_grad()
def wt_unpooled_one(seq, tokenizer, model, cls_id, eos_id, max_length=1022):
    """Embed one sequence and return per-token vectors with CLS/EOS removed.

    Returns a (L, hidden_size) float16 numpy array where L counts only positions
    that are attended to and are not the given special-token ids.
    """
    tok = tokenizer(seq, padding=False, truncation=True, max_length=max_length, return_tensors="pt")
    tok = {k: v.to(DEVICE) for k, v in tok.items()}
    out = model(**tok)
    h = out.last_hidden_state[0]
    attn = tok["attention_mask"][0].bool()
    ids = tok["input_ids"][0]

    # Keep attended positions, then strip the special tokens if their ids are known.
    keep = attn.clone()
    if cls_id is not None:
        keep &= (ids != cls_id)
    if eos_id is not None:
        keep &= (ids != eos_id)

    # float16 on CPU halves the memory/disk footprint of the stored embeddings.
    return h[keep].detach().cpu().to(torch.float16).numpy()
| |
|
def build_wt_unpooled_dataset(df_split: pd.DataFrame, out_dir: Path, tokenizer, model):
    """Build and save a HF dataset of per-token (unpooled) embeddings for WT pairs.

    Expects df_split to have:
      - target_sequence (seq1)
      - sequence (binder seq2; WT binder)
      - label, affinity_class, COL_AFF, COL_WT_IPTM
    Saves a dataset where each row contains BOTH:
      - target_embedding (Lt,H), target_attention_mask, target_length
      - binder_embedding (Lb,H), binder_attention_mask, binder_length
    """
    cls_id = tokenizer.cls_token_id
    eos_id = tokenizer.eos_token_id
    H = model.config.hidden_size  # per-token embedding width

    # Fixed schema: float16 embeddings keep shards small; attention masks are
    # all-ones because wt_unpooled_one already strips special tokens.
    features = Features({
        "target_sequence": Value("string"),
        "sequence": Value("string"),
        "label": Value("float32"),
        "affinity": Value("float32"),
        "affinity_class": Value("string"),

        "target_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
        "target_attention_mask": HFSequence(Value("int8")),
        "target_length": Value("int64"),

        "binder_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
        "binder_attention_mask": HFSequence(Value("int8")),
        "binder_length": Value("int64"),

        COL_WT_IPTM: Value("float32"),
        COL_AFF: Value("float32"),
    })

    def gen_rows(df: pd.DataFrame):
        # Stream rows one at a time so embeddings never all sit in memory.
        for r in pbar(df.itertuples(index=False), total=len(df)):
            tgt = str(getattr(r, "target_sequence")).strip()
            bnd = str(getattr(r, "sequence")).strip()

            y = float(getattr(r, "label"))
            aff = float(getattr(r, COL_AFF))
            acls = str(getattr(r, "affinity_class"))

            iptm = getattr(r, COL_WT_IPTM)
            iptm = float(iptm) if pd.notna(iptm) else np.nan

            # Embed target and binder independently with the same model.
            t_emb = wt_unpooled_one(tgt, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN)
            b_emb = wt_unpooled_one(bnd, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN)

            t_list = t_emb.tolist()
            b_list = b_emb.tolist()
            Lt = len(t_list)
            Lb = len(b_list)

            yield {
                "target_sequence": tgt,
                "sequence": bnd,
                "label": np.float32(y),
                "affinity": np.float32(aff),
                "affinity_class": acls,

                "target_embedding": t_list,
                "target_attention_mask": [1] * Lt,
                "target_length": int(Lt),

                "binder_embedding": b_list,
                "binder_attention_mask": [1] * Lb,
                "binder_length": int(Lb),

                COL_WT_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
                COL_AFF: np.float32(aff),
            }

    out_dir.mkdir(parents=True, exist_ok=True)
    ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
    ds.save_to_disk(str(out_dir), max_shard_size="1GB")
    return ds
| |
|
def build_smiles_unpooled_paired_dataset(df_split: pd.DataFrame, out_dir: Path, wt_tokenizer, wt_model_unpooled,
                                         smi_tok, smi_roformer):
    """Build and save a HF dataset pairing ESM target embeddings with SMILES binder embeddings.

    df_split must have:
      - target_sequence (seq1)
      - sequence (binder smiles string)
      - label, affinity_class, COL_AFF, COL_SMI_IPTM
    Saves rows with:
      target_embedding (Lt,Ht) from ESM
      binder_embedding (Lb,Hb) from PeptideCLM
    """
    cls_id = wt_tokenizer.cls_token_id
    eos_id = wt_tokenizer.eos_token_id
    Ht = wt_model_unpooled.config.hidden_size

    # The RoFormer config may expose its width as hidden_size or dim.
    Hb = getattr(smi_roformer.config, "hidden_size", None)
    if Hb is None:
        Hb = getattr(smi_roformer.config, "dim", None)
    if Hb is None:
        raise ValueError("Cannot infer Hb from smi_roformer config; print(smi_roformer.config) and set Hb manually.")

    features = Features({
        "target_sequence": Value("string"),
        "sequence": Value("string"),
        "label": Value("float32"),
        "affinity": Value("float32"),
        "affinity_class": Value("string"),

        "target_embedding": HFSequence(HFSequence(Value("float16"), length=Ht)),
        "target_attention_mask": HFSequence(Value("int8")),
        "target_length": Value("int64"),

        "binder_embedding": HFSequence(HFSequence(Value("float16"), length=Hb)),
        "binder_attention_mask": HFSequence(Value("int8")),
        "binder_length": Value("int64"),

        COL_SMI_IPTM: Value("float32"),
        COL_AFF: Value("float32"),
    })

    def gen_rows(df: pd.DataFrame):
        # Stream rows so per-token embeddings are never all resident at once.
        for r in pbar(df.itertuples(index=False), total=len(df)):
            tgt = str(getattr(r, "target_sequence")).strip()
            bnd = str(getattr(r, "sequence")).strip()

            y = float(getattr(r, "label"))
            aff = float(getattr(r, COL_AFF))
            acls = str(getattr(r, "affinity_class"))

            iptm = getattr(r, COL_SMI_IPTM)
            iptm = float(iptm) if pd.notna(iptm) else np.nan

            # Target side: per-residue ESM embedding (special tokens stripped).
            t_emb = wt_unpooled_one(tgt, wt_tokenizer, wt_model_unpooled, cls_id, eos_id, max_length=WT_MAX_LEN)
            t_list = t_emb.tolist()
            Lt = len(t_list)

            # Binder side: per-token SMILES embedding from a batch of one.
            _, tok_list, mask_list, lengths = smiles_embed_batch_return_both(
                [bnd], smi_tok, smi_roformer, max_length=SMI_MAX_LEN
            )
            b_emb = tok_list[0]
            b_list = b_emb.tolist()
            Lb = int(lengths[0])
            b_mask = mask_list[0].astype(np.int8).tolist()

            yield {
                "target_sequence": tgt,
                "sequence": bnd,
                "label": np.float32(y),
                "affinity": np.float32(aff),
                "affinity_class": acls,

                "target_embedding": t_list,
                "target_attention_mask": [1] * Lt,
                "target_length": int(Lt),

                "binder_embedding": b_list,
                "binder_attention_mask": [int(x) for x in b_mask],
                "binder_length": int(Lb),

                COL_SMI_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
                COL_AFF: np.float32(aff),
            }

    out_dir.mkdir(parents=True, exist_ok=True)
    ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
    ds.save_to_disk(str(out_dir), max_shard_size="1GB")
    return ds
| |
|
| |
|
| | |
| | |
| | |
def get_special_ids(tokenizer_obj):
    """Collect the tokenizer's defined special-token ids as a sorted, de-duplicated list."""
    attr_names = (
        "pad_token_id",
        "cls_token_id",
        "sep_token_id",
        "bos_token_id",
        "eos_token_id",
        "mask_token_id",
    )
    ids = {getattr(tokenizer_obj, name, None) for name in attr_names}
    ids.discard(None)
    return sorted(ids)
| |
|
@torch.no_grad()
def smiles_embed_batch_return_both(batch_sequences, tokenizer_obj, model_roformer, max_length):
    """Embed a batch of SMILES strings, returning pooled AND per-token forms.

    Returns:
        pooled: (B, H) numpy array — mean over non-special, non-padding tokens.
        token_emb_list: list of (Li, H) float16 arrays with special tokens removed.
        mask_list: list of all-ones int8 masks, one per row, matching each Li.
        lengths: list of Li ints.
    """
    tok = tokenizer_obj(
        batch_sequences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    input_ids = tok["input_ids"].to(DEVICE)
    attention_mask = tok["attention_mask"].to(DEVICE)

    outputs = model_roformer(input_ids=input_ids, attention_mask=attention_mask)
    last_hidden = outputs.last_hidden_state

    # A position counts only if it is attended to AND is not a special token.
    special_ids = get_special_ids(tokenizer_obj)
    valid = attention_mask.bool()
    if len(special_ids) > 0:
        sid = torch.tensor(special_ids, device=DEVICE, dtype=torch.long)
        if hasattr(torch, "isin"):
            valid = valid & (~torch.isin(input_ids, sid))
        else:
            # Fallback for torch versions without torch.isin: OR together per-id matches.
            m = torch.zeros_like(valid)
            for s in special_ids:
                m |= (input_ids == s)
            valid = valid & (~m)

    # Masked mean pooling over the sequence dimension.
    valid_f = valid.unsqueeze(-1).float()
    summed = torch.sum(last_hidden * valid_f, dim=1)
    denom = torch.clamp(valid_f.sum(dim=1), min=1e-9)  # guard: no valid tokens
    pooled = (summed / denom).detach().cpu().numpy()

    # Per-row variable-length token embeddings, stored as float16.
    token_emb_list, mask_list, lengths = [], [], []
    for b in range(last_hidden.shape[0]):
        emb = last_hidden[b, valid[b]]
        token_emb_list.append(emb.detach().cpu().to(torch.float16).numpy())
        li = emb.shape[0]
        lengths.append(int(li))
        mask_list.append(np.ones((li,), dtype=np.int8))

    return pooled, token_emb_list, mask_list, lengths
| |
|
def smiles_generate_embeddings_batched_both(seqs, tokenizer_obj, model_roformer, batch_size, max_length):
    """Run smiles_embed_batch_return_both over *seqs* in mini-batches and concatenate.

    Returns (pooled (N,H) array, list of per-row token arrays, list of masks, list of lengths).
    """
    pooled_chunks = []
    token_embs, masks, lengths = [], [], []

    for start in pbar(range(0, len(seqs), batch_size)):
        chunk = seqs[start:start + batch_size]
        pooled, tok_list, m_list, lens = smiles_embed_batch_return_both(
            chunk, tokenizer_obj, model_roformer, max_length
        )
        pooled_chunks.append(pooled)
        token_embs += tok_list
        masks += m_list
        lengths += lens

    return np.vstack(pooled_chunks), token_embs, masks, lengths
| |
|
def build_target_cache_from_wt_view(wt_view_train: pd.DataFrame, wt_view_val: pd.DataFrame):
    """Load the ESM model and pre-compute pooled target embeddings for both splits.

    Returns (tokenizer, model, train embedding matrix, val embedding matrix,
    train sequence->embedding map, val sequence->embedding map).
    """
    wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
    wt_model = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()

    train_seqs = wt_view_train["target_sequence"].astype(str).tolist()
    val_seqs = wt_view_val["target_sequence"].astype(str).tolist()

    wt_train_tgt_emb = wt_pooled_embeddings(
        train_seqs, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
    )
    wt_val_tgt_emb = wt_pooled_embeddings(
        val_seqs, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
    )

    # Lookup caches keyed by the exact sequence string (later duplicates overwrite).
    train_map = dict(zip(train_seqs, wt_train_tgt_emb))
    val_map = dict(zip(val_seqs, wt_val_tgt_emb))
    return wt_tok, wt_model, wt_train_tgt_emb, wt_val_tgt_emb, train_map, val_map
| | |
| | |
| | |
def main():
    """End-to-end pipeline: load CSV, split WT/SMILES branches, embed, and save datasets."""
    log(f"[INFO] DEVICE: {DEVICE}")
    OUT_ROOT.mkdir(parents=True, exist_ok=True)

    with section("load csv + dedup"):
        df = pd.read_csv(CSV_PATH)
        # Normalize whitespace on the sequence/SMILES columns before dedup.
        for c in [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]:
            if c in df.columns:
                df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)

        DEDUP_COLS = [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]
        df = df.drop_duplicates(subset=DEDUP_COLS).reset_index(drop=True)

        print("Rows after dedup on", DEDUP_COLS, ":", len(df))

    # Fail fast if the CSV lacks any expected column.
    need = [COL_SEQ1, COL_SEQ2, COL_AFF, COL_F2S, COL_REACT, COL_WT_IPTM, COL_SMI_IPTM]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")

    with section("prepare wt/smiles subsets"):
        # WT branch: binder is the raw amino-acid sequence; drop rows containing 'X'.
        df_wt = df.copy()
        df_wt["wt_sequence"] = df_wt[COL_SEQ2].astype(str).str.strip()
        df_wt = df_wt.dropna(subset=[COL_AFF]).reset_index(drop=True)
        df_wt = df_wt[df_wt["wt_sequence"].notna() & (df_wt["wt_sequence"] != "")]
        df_wt = df_wt[~df_wt["wt_sequence"].str.contains("X", case=False, na=False)].reset_index(drop=True)

        # SMILES branch: keep only rows with a numeric SMILES iPTM score.
        df_smi = df.copy()
        df_smi = df_smi.dropna(subset=[COL_AFF]).reset_index(drop=True)
        df_smi = df_smi[
            pd.to_numeric(df_smi[COL_SMI_IPTM], errors="coerce").notna()
        ].reset_index(drop=True)

        # Rows with unnatural amino acids ('X') use REACT_SMILES; others use Fasta2SMILES.
        is_uaa = df_smi[COL_SEQ2].astype(str).str.contains("X", case=False, na=False)
        df_smi["smiles_sequence"] = np.where(is_uaa, df_smi[COL_REACT], df_smi[COL_F2S])
        df_smi["smiles_sequence"] = df_smi["smiles_sequence"].astype(str).str.strip()
        df_smi = df_smi[df_smi["smiles_sequence"].notna() & (df_smi["smiles_sequence"] != "")]
        # astype(str) turns NaN/None into "nan"/"None" strings — drop those too.
        df_smi = df_smi[~df_smi["smiles_sequence"].isin(["nan", "None"])].reset_index(drop=True)

        log(f"[counts] WT rows={len(df_wt)} | SMILES rows={len(df_smi)} (after per-branch filtering)")

    with section("split wt and smiles separately"):
        # Each branch gets its own distribution-matched train/val split.
        df_wt2 = make_distribution_matched_split(df_wt)
        df_smi2 = make_distribution_matched_split(df_smi)

        wt_split_csv = OUT_ROOT / "binding_affinity_wt_meta_with_split.csv"
        smi_split_csv = OUT_ROOT / "binding_affinity_smiles_meta_with_split.csv"
        df_wt2.to_csv(wt_split_csv, index=False)
        df_smi2.to_csv(smi_split_csv, index=False)
        log(f"Saved WT split meta: {wt_split_csv}")
        log(f"Saved SMILES split meta: {smi_split_csv}")

        # Sanity-check both splits before any GPU work starts.
        verify_split_before_embedding(
            df2=df_wt2,
            affinity_col=COL_AFF,
            split_col="split",
            seq_col="wt_sequence",
            iptm_col=COL_WT_IPTM,
            aff_class_col="affinity_class",
            aff_bins=AFFINITY_Q_BINS,
            save_report_prefix=str(OUT_ROOT / "wt_split_doublecheck_report"),
            verbose=False,
        )
        verify_split_before_embedding(
            df2=df_smi2,
            affinity_col=COL_AFF,
            split_col="split",
            seq_col="smiles_sequence",
            iptm_col=COL_SMI_IPTM,
            aff_class_col="affinity_class",
            aff_bins=AFFINITY_Q_BINS,
            save_report_prefix=str(OUT_ROOT / "smiles_split_doublecheck_report"),
            verbose=False,
        )

    def prep_view(df_in: pd.DataFrame, binder_seq_col: str, iptm_col: str) -> pd.DataFrame:
        # Project a branch frame down to the common schema used downstream.
        out = df_in.copy()
        out["target_sequence"] = out[COL_SEQ1].astype(str).str.strip()
        out["sequence"] = out[binder_seq_col].astype(str).str.strip()
        out["label"] = pd.to_numeric(out[COL_AFF], errors="coerce")
        out[iptm_col] = pd.to_numeric(out[iptm_col], errors="coerce")
        out[COL_AFF] = pd.to_numeric(out[COL_AFF], errors="coerce")
        out = out.dropna(subset=["target_sequence", "sequence", "label"]).reset_index(drop=True)
        return out[["target_sequence", "sequence", "label", "split", iptm_col, COL_AFF, "affinity_class"]]

    wt_view = prep_view(df_wt2, "wt_sequence", COL_WT_IPTM)
    smi_view = prep_view(df_smi2, "smiles_sequence", COL_SMI_IPTM)

    wt_train = wt_view[wt_view["split"] == "train"].reset_index(drop=True)
    wt_val = wt_view[wt_view["split"] == "val"].reset_index(drop=True)
    smi_train = smi_view[smi_view["split"] == "train"].reset_index(drop=True)
    smi_val = smi_view[smi_view["split"] == "val"].reset_index(drop=True)

    with section("TARGET pooled embeddings (ESM) — WT + SMILES separately"):
        # One ESM model instance is reused for every embedding pass below.
        wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
        wt_esm = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()

        wt_train_tgt_emb = wt_pooled_embeddings(
            wt_train["target_sequence"].astype(str).str.strip().tolist(),
            wt_tok, wt_esm,
            batch_size=WT_BATCH,
            max_length=WT_MAX_LEN,
        ).astype(np.float32)

        wt_val_tgt_emb = wt_pooled_embeddings(
            wt_val["target_sequence"].astype(str).str.strip().tolist(),
            wt_tok, wt_esm,
            batch_size=WT_BATCH,
            max_length=WT_MAX_LEN,
        ).astype(np.float32)

        smi_train_tgt_emb = wt_pooled_embeddings(
            smi_train["target_sequence"].astype(str).str.strip().tolist(),
            wt_tok, wt_esm,
            batch_size=WT_BATCH,
            max_length=WT_MAX_LEN,
        ).astype(np.float32)

        smi_val_tgt_emb = wt_pooled_embeddings(
            smi_val["target_sequence"].astype(str).str.strip().tolist(),
            wt_tok, wt_esm,
            batch_size=WT_BATCH,
            max_length=WT_MAX_LEN,
        ).astype(np.float32)

    with section("WT pooled binder embeddings + save"):
        wt_train_emb = wt_pooled_embeddings(
            wt_train["sequence"].astype(str).str.strip().tolist(),
            wt_tok, wt_esm,
            batch_size=WT_BATCH,
            max_length=WT_MAX_LEN,
        ).astype(np.float32)

        wt_val_emb = wt_pooled_embeddings(
            wt_val["sequence"].astype(str).str.strip().tolist(),
            wt_tok, wt_esm,
            batch_size=WT_BATCH,
            max_length=WT_MAX_LEN,
        ).astype(np.float32)

        wt_train_ds = Dataset.from_dict({
            "target_sequence": wt_train["target_sequence"].tolist(),
            "sequence": wt_train["sequence"].tolist(),
            "label": wt_train["label"].astype(float).tolist(),
            "target_embedding": wt_train_tgt_emb,
            "embedding": wt_train_emb,
            COL_WT_IPTM: wt_train[COL_WT_IPTM].astype(float).tolist(),
            COL_AFF: wt_train[COL_AFF].astype(float).tolist(),
            "affinity_class": wt_train["affinity_class"].tolist(),
        })

        wt_val_ds = Dataset.from_dict({
            "target_sequence": wt_val["target_sequence"].tolist(),
            "sequence": wt_val["sequence"].tolist(),
            "label": wt_val["label"].astype(float).tolist(),
            "target_embedding": wt_val_tgt_emb,
            "embedding": wt_val_emb,
            COL_WT_IPTM: wt_val[COL_WT_IPTM].astype(float).tolist(),
            COL_AFF: wt_val[COL_AFF].astype(float).tolist(),
            "affinity_class": wt_val["affinity_class"].tolist(),
        })

        wt_pooled_dd = DatasetDict({"train": wt_train_ds, "val": wt_val_ds})
        wt_pooled_out = OUT_ROOT / "pair_wt_wt_pooled"
        wt_pooled_dd.save_to_disk(str(wt_pooled_out))
        log(f"Saved WT pooled -> {wt_pooled_out}")

    with section("SMILES pooled binder embeddings + save"):
        smi_tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
        # Use the bare RoFormer encoder from the MLM checkpoint (hidden states, no LM head).
        smi_roformer = (
            AutoModelForMaskedLM
            .from_pretrained(SMI_MODEL_NAME)
            .roformer
            .to(DEVICE)
            .eval()
        )

        smi_train_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
            smi_train["sequence"].astype(str).str.strip().tolist(),
            smi_tok, smi_roformer,
            batch_size=SMI_BATCH,
            max_length=SMI_MAX_LEN,
        )

        smi_val_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
            smi_val["sequence"].astype(str).str.strip().tolist(),
            smi_tok, smi_roformer,
            batch_size=SMI_BATCH,
            max_length=SMI_MAX_LEN,
        )

        smi_train_ds = Dataset.from_dict({
            "target_sequence": smi_train["target_sequence"].tolist(),
            "sequence": smi_train["sequence"].tolist(),
            "label": smi_train["label"].astype(float).tolist(),
            "target_embedding": smi_train_tgt_emb,
            "embedding": smi_train_pooled.astype(np.float32),
            COL_SMI_IPTM: smi_train[COL_SMI_IPTM].astype(float).tolist(),
            COL_AFF: smi_train[COL_AFF].astype(float).tolist(),
            "affinity_class": smi_train["affinity_class"].tolist(),
        })

        smi_val_ds = Dataset.from_dict({
            "target_sequence": smi_val["target_sequence"].tolist(),
            "sequence": smi_val["sequence"].tolist(),
            "label": smi_val["label"].astype(float).tolist(),
            "target_embedding": smi_val_tgt_emb,
            "embedding": smi_val_pooled.astype(np.float32),
            COL_SMI_IPTM: smi_val[COL_SMI_IPTM].astype(float).tolist(),
            COL_AFF: smi_val[COL_AFF].astype(float).tolist(),
            "affinity_class": smi_val["affinity_class"].tolist(),
        })

        smi_pooled_dd = DatasetDict({"train": smi_train_ds, "val": smi_val_ds})
        smi_pooled_out = OUT_ROOT / "pair_wt_smiles_pooled"
        smi_pooled_dd.save_to_disk(str(smi_pooled_out))
        log(f"Saved SMILES pooled -> {smi_pooled_out}")

    with section("WT unpooled paired embeddings + save"):
        # Same tokenizer/model as the pooled pass — only the pooling differs.
        wt_tok_unpooled = wt_tok
        wt_esm_unpooled = wt_esm

        wt_unpooled_out = OUT_ROOT / "pair_wt_wt_unpooled"
        wt_unpooled_dd = DatasetDict({
            "train": build_wt_unpooled_dataset(wt_train, wt_unpooled_out / "train",
                                               wt_tok_unpooled, wt_esm_unpooled),
            "val": build_wt_unpooled_dataset(wt_val, wt_unpooled_out / "val",
                                             wt_tok_unpooled, wt_esm_unpooled),
        })
        wt_unpooled_dd.save_to_disk(str(wt_unpooled_out))
        log(f"Saved WT unpooled -> {wt_unpooled_out}")

    with section("SMILES unpooled paired embeddings + save"):
        smi_unpooled_out = OUT_ROOT / "pair_wt_smiles_unpooled"
        smi_unpooled_dd = DatasetDict({
            "train": build_smiles_unpooled_paired_dataset(
                smi_train, smi_unpooled_out / "train",
                wt_tok, wt_esm,
                smi_tok, smi_roformer
            ),
            "val": build_smiles_unpooled_paired_dataset(
                smi_val, smi_unpooled_out / "val",
                wt_tok, wt_esm,
                smi_tok, smi_roformer
            ),
        })
        smi_unpooled_dd.save_to_disk(str(smi_unpooled_out))
        log(f"Saved SMILES unpooled -> {smi_unpooled_out}")

    log(f"\n[DONE] All datasets saved under: {OUT_ROOT}")


if __name__ == "__main__":
    main()
| |
|