| import os |
| import random |
| import numpy as np |
| import pandas as pd |
| from tqdm import tqdm |
| from array import array |
| import ROOT |
|
|
| import torch |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import roc_auc_score |
| from tabpfn import TabPFNClassifier |
|
|
| |
# Global determinism: seed every RNG source (torch, numpy, stdlib random,
# hash randomization) and force single-threaded torch so repeated runs of
# this script produce identical results.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
os.environ["PYTHONHASHSEED"] = str(42)
torch.set_num_threads(1)
| |
|
|
|
|
def tabpfn(signal, bkgd, batch_size=20_000, test_size=0.5, random_state=42):
    """Train a TabPFN classifier on one batch of events and score every event.

    Parameters
    ----------
    signal, bkgd : np.ndarray
        2-D event arrays whose columns follow the ``columns`` list below.
    batch_size : int
        Rows taken from each sample for training, and the chunk size used
        during batched inference.
    test_size : float
        Fraction of the training batch held out for the ROC-AUC estimate.
    random_state : int
        Seed applied to every RNG source for reproducibility.

    Returns
    -------
    tuple of np.ndarray
        ``(signal_scores, bkgd_scores)`` — per-event signal-class probability.
    """
    # Re-seed everything inside the function so results do not depend on
    # whatever ran before this call.
    torch.manual_seed(random_state)
    np.random.seed(random_state)
    random.seed(random_state)
    os.environ["PYTHONHASHSEED"] = str(random_state)
    torch.set_num_threads(1)
    try:
        torch.set_num_interop_threads(1)
    except RuntimeError:
        # Interop thread count can only be set once per process; ignore repeats.
        pass

    signal = np.nan_to_num(signal).astype(np.float32)
    bkgd = np.nan_to_num(bkgd).astype(np.float32)
    columns = ['ph1_pt', 'ph1_eta', 'ph1_phi', 'ph2_pt', 'ph2_eta', 'ph2_phi',
               'lep1_pt', 'lep1_eta', 'lep1_phi', 'lep2_pt', 'lep2_eta', 'lep2_phi',
               'jet1_pt', 'jet1_eta', 'jet1_phi', 'jet2_pt', 'jet2_eta', 'jet2_phi',
               'jet3_pt', 'jet3_eta', 'jet3_phi', 'jet4_pt', 'jet4_eta', 'jet4_phi',
               'jet5_pt', 'jet5_eta', 'jet5_phi', 'jet6_pt', 'jet6_eta', 'jet6_phi',
               'met_pt', 'met_phi', 'weight', 'SumWeights', 'XSection',
               'ph1_isTightID', 'ph2_isTightID',
               'scaleFactor_PILEUP', 'scaleFactor_PHOTON', 'scaleFactor_PhotonTRIGGER',
               'scaleFactor_ELE', 'scaleFactor_MUON', 'scaleFactor_LepTRIGGER', 'scaleFactor_BTAG',
               'm_yy', 'pt_yy']
    classifier_columns = ['ph1_pt', 'ph2_pt', 'ph1_eta', 'ph2_eta', 'delta_phi']

    def _prepare(events):
        """Build the feature frame: delta_phi plus m_yy-normalized photon pTs."""
        df = pd.DataFrame(events, columns=columns)
        df['delta_phi'] = df['ph2_phi'] - df['ph1_phi']
        df['ph1_pt'] /= df['m_yy']
        df['ph2_pt'] /= df['m_yy']
        # .copy() so the in-place cleanups below act on an owned frame
        # (avoids pandas SettingWithCopyWarning / silent no-op edits).
        df = df[classifier_columns].copy()
        df.replace([np.inf, -np.inf], 0.0, inplace=True)
        df.fillna(0.0, inplace=True)
        return df

    signal_df = _prepare(signal)
    bkgd_df = _prepare(bkgd)
    signal_df['target'] = 1
    bkgd_df['target'] = 0
    # Explicit feature list instead of positional df.columns[0:-1].
    feature_columns = classifier_columns

    # Train on the first batch_size rows of each sample (background first,
    # matching the original concat order for reproducibility).
    df = pd.concat([bkgd_df.iloc[0:batch_size], signal_df.iloc[0:batch_size]])
    # Sort first so the shuffle below sees a deterministic row order
    # regardless of how the inputs were ordered.
    df = df.sort_values(by='ph1_pt')
    df = df.sample(frac=1, random_state=random_state)

    x_train, x_test, y_train, y_test = train_test_split(df, df['target'], test_size=test_size, random_state=random_state)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('Device: ', device)
    clf = TabPFNClassifier(ignore_pretraining_limits=True, device=device)
    clf.fit(x_train[feature_columns], y_train)

    prediction_probabilities = clf.predict_proba(x_test[feature_columns])
    print('ROC AUC:', roc_auc_score(y_test, prediction_probabilities[:, 1]))

    bar_format = '{l_bar}{bar:20}{r_bar}{bar:-10b}'

    def _score(frame, desc):
        """Score a whole frame in batch_size chunks to bound memory use."""
        scores = np.zeros(len(frame))
        n_batches = (len(frame) + batch_size - 1) // batch_size
        for i in tqdm(range(n_batches), desc=desc, unit='batch',
                      bar_format=bar_format, total=n_batches):
            lo = i * batch_size
            hi = min(lo + batch_size, len(frame))
            scores[lo:hi] = clf.predict_proba(frame.iloc[lo:hi][feature_columns])[:, 1]
        return scores

    signal_scores = _score(signal_df, 'Signal score inference')
    bkgd_scores = _score(bkgd_df, 'Bkgd score inference')
    return signal_scores, bkgd_scores
|
|
|
|
def load_datasets(signal, bkgd, signal_scores, bkgd_scores):
    """Persist per-event scores and weights to ROOT files and reload them.

    Parameters
    ----------
    signal, bkgd : np.ndarray
        2-D event arrays; column 32 is the per-event 'weight'.
    signal_scores, bkgd_scores : np.ndarray
        Per-event classifier scores, aligned with the event arrays.

    Returns
    -------
    tuple
        ``(signal_df, bkgd_df)`` RDataFrames over the written 'output' trees.
    """
    # Column 32 corresponds to 'weight' in the tabpfn() columns list.
    signal_weights = signal[:, 32]
    bkgd_weights = bkgd[:, 32]

    output_dir = os.environ.get('OUTPUT_DIR', os.getcwd())
    results_dir = os.path.join(output_dir, 'results')
    os.makedirs(results_dir, exist_ok=True)

    signal_root_path = os.path.join(results_dir, 'signal.root')
    bkgd_root_path = os.path.join(results_dir, 'bkgd.root')

    _write_scores_tree(signal_root_path, signal_scores, signal_weights)
    _write_scores_tree(bkgd_root_path, bkgd_scores, bkgd_weights)

    signal_df = ROOT.RDataFrame('output', signal_root_path)
    bkgd_df = ROOT.RDataFrame('output', bkgd_root_path)

    return signal_df, bkgd_df


def _write_scores_tree(path, scores, weights):
    """Write an 'output' TTree with ml_score/normalized_weight branches, one
    entry per event, to a fresh ROOT file at *path*."""
    # Open the file BEFORE creating the tree so the tree is attached to it
    # (ROOT associates a new TTree with the current directory).
    out_file = ROOT.TFile(path, 'RECREATE')
    tree = ROOT.TTree('output', 'output')
    score_buf = array('d', [0.0])
    weight_buf = array('d', [0.0])
    tree.Branch('ml_score', score_buf, 'ml_score/D')
    tree.Branch('normalized_weight', weight_buf, 'normalized_weight/D')
    for score_val, weight_val in zip(scores, weights):
        score_buf[0] = score_val
        weight_buf[0] = weight_val
        tree.Fill()
    tree.Write()
    out_file.Close()
|
|
|
|
def place_boundary(signal_df, bkgd_df, boundaries, num_bins, min_events):
    """Find the single new category boundary that maximizes total significance.

    For each existing interval in *boundaries*, finds the locally optimal cut
    via get_optimal_cut_sb, inserts it into a trial boundary set, and scores
    that set with get_significance.

    Parameters
    ----------
    signal_df, bkgd_df : ROOT.RDataFrame
        Frames with 'ml_score' and 'normalized_weight' columns.
    boundaries : array-like
        Current sorted category boundaries (at least two values).
    num_bins, min_events :
        Passed through to get_optimal_cut_sb.

    Returns
    -------
    tuple of float
        ``(best_cut, best_significance)``; best_cut is -1.0 when no interval
        admitted a valid cut.

    Raises
    ------
    ValueError
        If *boundaries* has fewer than two values.
    """
    boundaries = np.asarray(boundaries)
    b_candidates = []
    Z_candidates = []
    for idx in range(boundaries.shape[0] - 1):
        start_score = boundaries[idx]
        stop_score = boundaries[idx + 1]
        b, _ = get_optimal_cut_sb(signal_df, bkgd_df, start_score, stop_score, num_bins, min_events)
        b_candidates.append(b)

        # b < 0 means no bin split in this interval passed the
        # minimum-event requirement.
        if b < 0:
            Z_candidates.append(0)
            continue
        insert_idx = np.searchsorted(boundaries, b)
        trial_boundaries = np.insert(boundaries, insert_idx, b)

        Z_candidates.append(get_significance(signal_df, bkgd_df, trial_boundaries))

    if not Z_candidates:
        raise ValueError('boundaries must contain at least two values')

    best_idx = int(np.argmax(Z_candidates))
    return float(b_candidates[best_idx]), float(Z_candidates[best_idx])
|
|
|
|
def get_optimal_cut_sb(signal_df, bkgd_df, start_score, stop_score, num_bins, min_events):
    """Scan for the cut in [start_score, stop_score) that maximizes the
    combined Asimov significance of the two resulting score regions.

    Parameters
    ----------
    signal_df, bkgd_df : ROOT.RDataFrame
        Frames with 'ml_score' and 'normalized_weight' columns.
    start_score, stop_score : float
        Score range to scan; mapped to histogram bins via FindBin.
    num_bins : int
        Number of histogram bins over the full [0, 1] score range.
    min_events : number
        Minimum raw (unweighted) event count required in every region.

    Returns
    -------
    tuple
        ``(optimal_cut, ZZ)`` where optimal_cut is the best candidate
        boundary (-1 when no candidate satisfied min_events) and ZZ is the
        array of combined significances for every scanned bin.
    """
    bin_edges = np.linspace(0, 1, num_bins + 1)


    score = 'ml_score'
    title = 'Signal/Background;ML Score;Event Count'

    # Weighted histograms give physical yields for the significance formula.
    signal_hist = signal_df.Histo1D(('signal_histogram', title, num_bins, 0, 1), score, 'normalized_weight')
    bkgd_hist = bkgd_df.Histo1D(('bkgd_histogram', title, num_bins, 0, 1), score, 'normalized_weight')

    # Unweighted histograms give raw counts for the min_events sanity check.
    signal_hist_unweighted = signal_df.Histo1D(('signal_histogram_unweighted', title, num_bins, 0, 1), score)
    bkgd_hist_unweighted = bkgd_df.Histo1D(('bkgd_histogram_unweighted', title, num_bins, 0, 1), score)


    # ROOT bins are 1-indexed; FindBin maps a score to its bin.
    # stop_bin is the last bin fully below stop_score.
    start_bin = signal_hist.FindBin(float(start_score))
    stop_bin = signal_hist.FindBin(float(stop_score))-1


    ZZ = []
    candidate_boundaries = []


    # Candidate cut at bin b splits the range into lower bins
    # [start_bin, b-1] and upper bins [b, stop_bin].
    for b in range(start_bin + 1, stop_bin):
        signal_lower_yield = signal_hist.Integral(start_bin, b-1)
        signal_upper_yield = signal_hist.Integral(b, stop_bin)


        bkgd_lower_yield = bkgd_hist.Integral(start_bin, b-1)
        bkgd_upper_yield = bkgd_hist.Integral(b, stop_bin)


        signal_lower_counts = signal_hist_unweighted.Integral(start_bin, b-1)
        signal_upper_counts = signal_hist_unweighted.Integral(b, stop_bin)


        bkgd_lower_counts = bkgd_hist_unweighted.Integral(start_bin, b-1)
        bkgd_upper_counts = bkgd_hist_unweighted.Integral(b, stop_bin)


        if check_counts_sb(signal_lower_counts, signal_upper_counts,
                           bkgd_lower_counts, bkgd_upper_counts, min_events):
            Z_lower = Z_sb(signal_lower_yield, bkgd_lower_yield)
            Z_upper = Z_sb(signal_upper_yield, bkgd_upper_yield)
            Z_lower = np.nan_to_num(Z_lower, nan=0.0)
            Z_upper = np.nan_to_num(Z_upper, nan=0.0)


            Z_tot = Z_comb(np.array([Z_lower, Z_upper]))
            ZZ.append(Z_tot)

        else:
            # Region too sparse — record zero so indices stay aligned
            # with candidate_boundaries.
            ZZ.append(0)

        # NOTE(review): ROOT bin b spans [bin_edges[b-1], bin_edges[b]], so
        # the lower/upper split computed above lies at bin_edges[b-1], not
        # bin_edges[b] — confirm whether bin_edges[b] is the intended
        # boundary or an off-by-one.
        candidate_boundaries.append(bin_edges[b])


    ZZ = np.array(ZZ)
    if len(ZZ) > 0:
        optimal_cut = candidate_boundaries[np.argmax(ZZ)]
    else:
        # Scan range too narrow to contain any interior cut.
        optimal_cut = -1


    return optimal_cut, ZZ
|
|
|
|
def check_counts_sb(signal_lower_counts, signal_upper_counts, bkgd_lower_counts,
                    bkgd_upper_counts, min_events):
    """Return True when every region's raw event count exceeds min_events."""
    counts = (signal_lower_counts, signal_upper_counts,
              bkgd_lower_counts, bkgd_upper_counts)
    return all(count > min_events for count in counts)
|
|
|
|
def Z_sb(s, b):
    """Element-wise Asimov significance Z = sqrt(2*((s+b)*ln(1+s/b) - s)).

    Both inputs are promoted to at-least-1-D arrays; any entry with a
    non-positive background yield gets Z = 0 to avoid division by zero.
    """
    s = np.array(s, ndmin=1)
    b = np.array(b, ndmin=1)
    significance = np.zeros_like(b, dtype=np.float64)
    valid = b > 0
    s_valid = s[valid]
    b_valid = b[valid]
    significance[valid] = np.sqrt(2 * ((s_valid + b_valid) * np.log(1 + s_valid / b_valid) - s_valid))
    return significance
|
|
|
|
def Z_comb(zz):
    """Combine per-region significances in quadrature (sqrt of summed squares)."""
    squared_total = np.sum(np.square(zz))
    return np.sqrt(squared_total)
|
|
|
|
def get_significance(signal_df, bkgd_df, boundaries):
    """Total significance of a binning defined by *boundaries*.

    Each consecutive pair of boundaries defines a half-open score interval
    [lo, hi); the weighted signal and background yields inside each interval
    feed Z_sb, and the per-interval values are combined in quadrature.
    """
    boundaries = np.array(boundaries)
    score = 'ml_score'
    region_significances = []
    for lo, hi in zip(boundaries[:-1], boundaries[1:]):
        selection = f'{score} >= {lo} && {score} < {hi}'

        s_yield = signal_df.Filter(selection).Sum('normalized_weight').GetValue()
        b_yield = bkgd_df.Filter(selection).Sum('normalized_weight').GetValue()

        region_significances.append(Z_sb(s_yield, b_yield))
    return float(Z_comb(np.array(region_significances)))