# Source: Hugging Face file view of LLM4HEP/solution/utils.py (uploaded by ho22joshua, commit cfcbbc8 "initial commit", 10.6 kB).
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from array import array
import ROOT
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tabpfn import TabPFNClassifier
# Pin every random-number source at import time so TabPFN runs are repeatable.
os.environ["PYTHONHASHSEED"] = "42"
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
# Limit torch to a single intra-op thread as part of the same determinism setup.
torch.set_num_threads(1)
# Note: torch.set_num_interop_threads(1) is deliberately not called here; it was removed to avoid a runtime error.
# Column order of the raw event arrays handed to ``tabpfn``.
_EVENT_COLUMNS = [
    'ph1_pt', 'ph1_eta', 'ph1_phi', 'ph2_pt', 'ph2_eta', 'ph2_phi',
    'lep1_pt', 'lep1_eta', 'lep1_phi', 'lep2_pt', 'lep2_eta', 'lep2_phi',
    'jet1_pt', 'jet1_eta', 'jet1_phi', 'jet2_pt', 'jet2_eta', 'jet2_phi',
    'jet3_pt', 'jet3_eta', 'jet3_phi', 'jet4_pt', 'jet4_eta', 'jet4_phi',
    'jet5_pt', 'jet5_eta', 'jet5_phi', 'jet6_pt', 'jet6_eta', 'jet6_phi',
    'met_pt', 'met_phi', 'weight', 'SumWeights', 'XSection',
    'ph1_isTightID', 'ph2_isTightID',
    'scaleFactor_PILEUP', 'scaleFactor_PHOTON', 'scaleFactor_PhotonTRIGGER',
    'scaleFactor_ELE', 'scaleFactor_MUON', 'scaleFactor_LepTRIGGER', 'scaleFactor_BTAG',
    'm_yy', 'pt_yy',
]

# Features the classifier is trained on and evaluated with.
_CLASSIFIER_COLUMNS = ['ph1_pt', 'ph2_pt', 'ph1_eta', 'ph2_eta', 'delta_phi']

_PROGRESS_BAR_FORMAT = '{l_bar}{bar:20}{r_bar}{bar:-10b}'


def _build_features(events, target):
    """Return the classifier feature frame for *events* plus a constant ``target`` column."""
    raw = pd.DataFrame(events, columns=_EVENT_COLUMNS)
    # NOTE(review): delta_phi is a plain difference and is not wrapped into
    # [-pi, pi] -- confirm this is intended.
    raw['delta_phi'] = raw['ph2_phi'] - raw['ph1_phi']
    raw['ph1_pt'] /= raw['m_yy']
    raw['ph2_pt'] /= raw['m_yy']
    # Explicit copy: the frame is modified below, and mutating a column slice
    # of ``raw`` is the chained-assignment pattern pandas warns about.
    features = raw[_CLASSIFIER_COLUMNS].copy()
    # Division by m_yy == 0 yields inf/NaN; both are mapped to 0.
    features = features.replace([np.inf, -np.inf], 0.0).fillna(0.0)
    features['target'] = target
    return features


def _score_in_batches(clf, features, batch_size, desc):
    """Return ``clf``'s class-1 probability for every row of *features*, predicted batch by batch."""
    inputs = features[_CLASSIFIER_COLUMNS]
    n_rows = inputs.shape[0]
    scores = np.zeros(n_rows)
    batch_starts = range(0, n_rows, batch_size)
    for start_idx in tqdm(batch_starts, desc=desc, unit='batch',
                          bar_format=_PROGRESS_BAR_FORMAT, total=len(batch_starts)):
        stop_idx = min(start_idx + batch_size, n_rows)
        scores[start_idx:stop_idx] = clf.predict_proba(inputs.iloc[start_idx:stop_idx])[:, 1]
    return scores


def tabpfn(signal, bkgd, batch_size=20_000, test_size=0.5, random_state=42):
    """Train a TabPFN classifier on signal vs. background and score every event.

    The classifier is fitted on a train split drawn from the first
    ``batch_size`` rows of each sample, its ROC AUC on the held-out split is
    printed, and it is then applied to all rows of both samples in batches of
    ``batch_size``.

    Args:
        signal: 2-D array of signal events, one row per event, with columns in
            the order of ``_EVENT_COLUMNS``.
        bkgd: 2-D array of background events with the same column layout.
        batch_size: Maximum number of rows per sample used for training, and
            the number of rows per inference call. Must be at least 1.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed applied to torch, numpy, ``random``, the shuffle and
            the train/test split.

    Returns:
        ``(signal_scores, bkgd_scores)``: two 1-D numpy arrays holding the
        predicted class-1 (signal) probability for every input row.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size!r}')

    # Set random seeds for reproducibility
    torch.manual_seed(random_state)
    np.random.seed(random_state)
    random.seed(random_state)
    os.environ["PYTHONHASHSEED"] = str(random_state)
    torch.set_num_threads(1)
    # Try to set interop threads, but handle the case where it's already been set
    try:
        torch.set_num_interop_threads(1)
    except RuntimeError:
        pass  # Interop threads already set, continue

    signal = np.nan_to_num(signal).astype(np.float32)
    bkgd = np.nan_to_num(bkgd).astype(np.float32)

    signal_df = _build_features(signal, 1)
    bkgd_df = _build_features(bkgd, 0)

    # Training sample: the leading rows of each class, put into a fixed order
    # (sort) and then shuffled with a fixed seed.
    df = pd.concat([bkgd_df.iloc[0:batch_size], signal_df.iloc[0:batch_size]])
    df = df.sort_values(by='ph1_pt')
    df = df.sample(frac=1, random_state=random_state)
    x_train, x_test, y_train, y_test = train_test_split(
        df, df['target'], test_size=test_size, random_state=random_state)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('Device: ', device)
    clf = TabPFNClassifier(ignore_pretraining_limits=True, device=device)
    clf.fit(x_train[_CLASSIFIER_COLUMNS], y_train)
    prediction_probabilities = clf.predict_proba(x_test[_CLASSIFIER_COLUMNS])
    print('ROC AUC:', roc_auc_score(y_test, prediction_probabilities[:, 1]))

    # evaluating on full sample takes ~10 min
    signal_scores = _score_in_batches(clf, signal_df, batch_size, 'Signal score inference')
    bkgd_scores = _score_in_batches(clf, bkgd_df, batch_size, 'Bkgd score inference')
    return signal_scores, bkgd_scores
def _write_score_tree(root_path, scores, weights):
    """Fill an 'output' TTree with (ml_score, normalized_weight) pairs and write it to a new ROOT file."""
    tree = ROOT.TTree('output', 'output')
    # Single-element buffers that the branches read from on every Fill().
    score_buf = array('d', [0.0])
    weight_buf = array('d', [0.0])
    tree.Branch('ml_score', score_buf, 'ml_score/D')
    tree.Branch('normalized_weight', weight_buf, 'normalized_weight/D')
    for i, value in enumerate(scores):
        score_buf[0] = value
        weight_buf[0] = weights[i]
        tree.Fill()
    out_file = ROOT.TFile(root_path, 'RECREATE')
    tree.Write()
    out_file.Close()


def load_datasets(signal, bkgd, signal_scores, bkgd_scores):
    """Write scores and event weights to ROOT files and reopen them as RDataFrames.

    The weights are read from column index 32 of ``signal`` and ``bkgd``
    (the 'weight' entry in the column list used by ``tabpfn``). Two files,
    ``signal.root`` and ``bkgd.root``, are (re)created in
    ``<OUTPUT_DIR>/results``; ``OUTPUT_DIR`` comes from the environment and
    falls back to the current working directory.

    Returns:
        ``(signal_df, bkgd_df)``: ``ROOT.RDataFrame`` objects over the 'output'
        tree of each file, with branches ``ml_score`` and ``normalized_weight``.
    """
    signal_weights = signal[:, 32]
    bkgd_weights = bkgd[:, 32]

    # Job-specific output directory: OUTPUT_DIR (set by the runner) when
    # present, otherwise the current working directory.
    base_dir = os.environ.get('OUTPUT_DIR', os.getcwd())
    results_dir = os.path.join(base_dir, 'results')
    os.makedirs(results_dir, exist_ok=True)
    signal_root_path = os.path.join(results_dir, 'signal.root')
    bkgd_root_path = os.path.join(results_dir, 'bkgd.root')

    _write_score_tree(signal_root_path, signal_scores, signal_weights)
    _write_score_tree(bkgd_root_path, bkgd_scores, bkgd_weights)

    return (
        ROOT.RDataFrame('output', signal_root_path),
        ROOT.RDataFrame('output', bkgd_root_path),
    )
def place_boundary(signal_df, bkgd_df, boundaries, num_bins, min_events):
    """Find the single extra category boundary that gives the highest combined significance.

    For each pair of neighbouring entries in ``boundaries`` the best split
    inside that interval is obtained from ``get_optimal_cut_sb``. The split is
    then inserted into the full boundary list and the combined significance of
    the resulting categorisation is computed with ``get_significance``.

    Args:
        signal_df, bkgd_df: Data frames passed through to the helper functions.
        boundaries: Sorted sequence of current category boundaries.
        num_bins: Histogram binning forwarded to ``get_optimal_cut_sb``.
        min_events: Minimum-count requirement forwarded to ``get_optimal_cut_sb``.

    Returns:
        ``(boundary, Z)`` as Python floats for the best candidate. When no
        interval yields a usable split -- including when ``boundaries`` holds
        fewer than two entries -- the result is ``(-1.0, 0.0)``.
    """
    boundaries = np.array(boundaries)
    b_candidates = []
    Z_candidates = []
    for start_score, stop_score in zip(boundaries[:-1], boundaries[1:]):
        b, _ = get_optimal_cut_sb(signal_df, bkgd_df, start_score, stop_score, num_bins, min_events)
        b_candidates.append(b)
        if b < 0:
            # Negative value is the "no valid cut in this interval" sentinel.
            Z_candidates.append(0)
            continue
        insert_at = np.searchsorted(boundaries, b)
        trial_boundaries = np.insert(boundaries, insert_at, b)
        Z_candidates.append(get_significance(signal_df, bkgd_df, trial_boundaries))

    if not Z_candidates:
        # No interval to split: np.argmax would raise on an empty sequence, so
        # report the same sentinel the all-intervals-invalid case produces.
        return -1.0, 0.0

    best_idx = int(np.argmax(Z_candidates))
    return float(b_candidates[best_idx]), float(Z_candidates[best_idx])
def get_optimal_cut_sb(signal_df, bkgd_df, start_score, stop_score, num_bins, min_events):
    """Scan histogram bin edges between two scores for the split with the best combined significance.

    Weighted and unweighted ``ml_score`` histograms with ``num_bins`` bins on
    [0, 1] are booked for signal and background. For every candidate bin ``b``
    strictly inside the interval, the interval is split into a lower part
    (bins ``start_bin .. b-1``) and an upper part (bins ``b .. stop_bin``).
    A candidate is valid only if all four unweighted counts exceed
    ``min_events``; valid candidates get the quadrature sum of the two
    ``Z_sb`` values, invalid ones get 0.

    Returns:
        ``(optimal_cut, ZZ)``: ``optimal_cut`` is the score value of the best
        valid split, or ``-1`` if there is no candidate or none satisfies
        ``min_events``. ``ZZ`` is a numpy array with one entry per scanned
        candidate (0 for invalid candidates).
    """
    bin_edges = np.linspace(0, 1, num_bins + 1)
    score = 'ml_score'
    title = 'Signal/Background;ML Score;Event Count'
    # Book all four histograms before touching any of them so each data frame
    # is only looped over once.
    signal_hist = signal_df.Histo1D(('signal_histogram', title, num_bins, 0, 1), score, 'normalized_weight')
    bkgd_hist = bkgd_df.Histo1D(('bkgd_histogram', title, num_bins, 0, 1), score, 'normalized_weight')
    signal_hist_unweighted = signal_df.Histo1D(('signal_histogram_unweighted', title, num_bins, 0, 1), score)
    bkgd_hist_unweighted = bkgd_df.Histo1D(('bkgd_histogram_unweighted', title, num_bins, 0, 1), score)

    # ROOT histogram bins defined s.t. bin containing bin boundary *starts* at bin boundary
    # since we want to include start_score and exclude stop_score, we should define stop_bin
    # to be bin *below* bin containing stop_score
    start_bin = signal_hist.FindBin(float(start_score))
    stop_bin = signal_hist.FindBin(float(stop_score)) - 1

    ZZ = []
    candidate_boundaries = []
    is_valid = []
    for b in range(start_bin + 1, stop_bin):
        signal_lower_yield = signal_hist.Integral(start_bin, b - 1)
        signal_upper_yield = signal_hist.Integral(b, stop_bin)
        bkgd_lower_yield = bkgd_hist.Integral(start_bin, b - 1)
        bkgd_upper_yield = bkgd_hist.Integral(b, stop_bin)
        signal_lower_counts = signal_hist_unweighted.Integral(start_bin, b - 1)
        signal_upper_counts = signal_hist_unweighted.Integral(b, stop_bin)
        bkgd_lower_counts = bkgd_hist_unweighted.Integral(start_bin, b - 1)
        bkgd_upper_counts = bkgd_hist_unweighted.Integral(b, stop_bin)

        passes = bool(check_counts_sb(signal_lower_counts, signal_upper_counts,
                                      bkgd_lower_counts, bkgd_upper_counts, min_events))
        if passes:
            Z_lower = np.nan_to_num(Z_sb(signal_lower_yield, bkgd_lower_yield), nan=0.0)
            Z_upper = np.nan_to_num(Z_sb(signal_upper_yield, bkgd_upper_yield), nan=0.0)
            ZZ.append(Z_comb(np.array([Z_lower, Z_upper])))
        else:
            ZZ.append(0)
        is_valid.append(passes)
        # The upper part starts with bin b, so the split sits at the LOW edge
        # of bin b. ROOT bins are 1-based, hence index b - 1 in bin_edges.
        candidate_boundaries.append(bin_edges[b - 1])

    ZZ = np.array(ZZ)
    if any(is_valid):
        # Rank valid candidates only; otherwise an all-zero ZZ would make
        # argmax return the first candidate even though it fails min_events.
        ranked = np.where(is_valid, ZZ, -np.inf)
        optimal_cut = candidate_boundaries[int(np.argmax(ranked))]
    else:
        optimal_cut = -1
    return optimal_cut, ZZ
def check_counts_sb(signal_lower_counts, signal_upper_counts, bkgd_lower_counts,
                    bkgd_upper_counts, min_events):
    """Return whether the smallest of the four counts is strictly greater than ``min_events``."""
    counts = (signal_lower_counts, signal_upper_counts,
              bkgd_lower_counts, bkgd_upper_counts)
    smallest = min(counts)
    return smallest > min_events
def Z_sb(s, b):
    """Return the Asimov significance ``sqrt(2 * ((s + b) * ln(1 + s/b) - s))``.

    Args:
        s: Signal yield(s); scalar or array-like.
        b: Background yield(s); scalar or array-like. ``s`` and ``b`` are
            broadcast against each other.

    Returns:
        A float64 numpy array with at least one dimension. Entries where
        ``b <= 0`` are 0. Entries where ``1 + s/b`` is not positive are NaN
        or undefined, as the logarithm has no real value there.
    """
    s = np.atleast_1d(np.asarray(s, dtype=np.float64))
    b = np.atleast_1d(np.asarray(b, dtype=np.float64))
    s, b = np.broadcast_arrays(s, b)

    ZZ = np.zeros(b.shape, dtype=np.float64)
    mask = b > 0
    s_sel = s[mask]
    b_sel = b[mask]
    # log1p keeps precision when s << b; log(1 + s/b) loses the small term
    # and can leave the expression slightly negative.
    radicand = 2 * ((s_sel + b_sel) * np.log1p(s_sel / b_sel) - s_sel)
    # The exact expression is non-negative, so clipping only removes rounding
    # noise (NaN from unphysical inputs passes through np.maximum unchanged).
    ZZ[mask] = np.sqrt(np.maximum(radicand, 0.0))
    return ZZ
def Z_comb(zz):
    """Combine the significances in array ``zz`` in quadrature: sqrt of the sum of squares."""
    sum_of_squares = np.sum(zz * zz)
    return np.sqrt(sum_of_squares)
def get_significance(signal_df, bkgd_df, boundaries):
    """Return the combined significance of the categories defined by ``boundaries``.

    Each pair of neighbouring boundaries defines one category selecting
    ``start <= ml_score < stop``. The summed ``normalized_weight`` of signal
    and background in each category is turned into a significance with
    ``Z_sb``, and the categories are combined in quadrature with ``Z_comb``.

    Args:
        signal_df, bkgd_df: Data frames exposing ``Filter(...).Sum(...)`` with
            columns ``ml_score`` and ``normalized_weight``.
        boundaries: Sequence of category boundaries in ascending order.

    Returns:
        The combined significance as a Python float (0.0 when fewer than two
        boundaries are given).
    """
    boundaries = np.array(boundaries)
    score = 'ml_score'

    # Book every per-category sum first. The results are lazy, so the first
    # GetValue() on each data frame fills all of its booked sums in a single
    # event loop instead of one loop per category.
    booked = []
    for start_score, stop_score in zip(boundaries[:-1], boundaries[1:]):
        selection = f'{score} >= {start_score} && {score} < {stop_score}'
        booked.append((
            signal_df.Filter(selection).Sum('normalized_weight'),
            bkgd_df.Filter(selection).Sum('normalized_weight'),
        ))

    # NaN (e.g. from a negative summed yield) is treated as zero significance,
    # matching get_optimal_cut_sb; a NaN here would otherwise propagate into
    # the combined value.
    ZZ = [np.nan_to_num(Z_sb(s.GetValue(), b.GetValue()), nan=0.0) for s, b in booked]
    return float(Z_comb(np.array(ZZ)))