# plos-student / app.py
# adejumobi's picture
# added model_results
# e1e3c41 verified
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
import seaborn as sns
import matplotlib.pyplot as plt
import gradio as gr
# Read the benchmark results once, at import time; the functions below all
# filter this shared table by its 'dataset' and 'model' columns.
df_results = pd.read_csv("model_results.csv")

# Distinct model names and dataset IDs present in the results, sorted.
ALL_MODELS, AVAILABLE_DATASETS = (
    sorted(pd.unique(df_results[column]).tolist())
    for column in ("model", "dataset")
)
# Dataset categorization.
#
# Maps a category label to {dataset ID: display name}.  The UI code builds one
# multiselect dropdown per category from this mapping, labelling each choice
# "<ID>: <name>" and keeping only IDs that also appear in AVAILABLE_DATASETS.
# A dataset ID that is present in the results CSV but missing here is
# therefore never offered for selection.
#
# Several display names repeat under different IDs (e.g. D9/D25, D49/D93,
# D185/D186); the ID is what distinguishes them.
DATASET_CATEGORIES = {
    "Medical & Healthcare": {
        "D1": "Heart Disease (Comprehensive)",
        "D2": "Heart attack possibility",
        "D3": "Heart Disease Dataset",
        "D4": "Liver Disorders",
        "D5": "Diabetes Prediction",
        "D9": "Chronic Kidney Disease",
        "D10": "Breast Cancer Prediction",
        "D11": "Stroke Prediction",
        "D12": "Lung Cancer Prediction",
        "D13": "Hepatitis",
        "D15": "Thyroid Disease",
        "D16": "Heart Failure Prediction",
        "D17": "Parkinson's",
        "D18": "Indian Liver Patient",
        "D19": "COVID-19 Effect on Liver Cancer",
        "D20": "Liver Dataset",
        "D21": "Specht Heart",
        "D22": "Early-stage Diabetes",
        "D23": "Diabetic Retinopathy",
        "D24": "Breast Cancer Coimbra",
        "D25": "Chronic Kidney Disease",
        "D26": "Kidney Stone",
        "D28": "Echocardiogram",
        "D29": "Bladder Cancer Recurrence",
        "D31": "Prostate Cancer",
        "D46": "Real Breast Cancer Data",
        "D47": "Breast Cancer (Royston)",
        "D48": "Lung Cancer Dataset",
        "D52": "Cervical Cancer Risk",
        "D53": "Breast Cancer Wisconsin",
        "D61": "Breast Cancer Prediction",
        "D62": "Thyroid Disease",
        "D68": "Lung Cancer",
        "D69": "Cancer Patients Data",
        # NOTE(review): "Labor Relations" looks out of place in a medical
        # category -- confirm the name/category of D70.
        "D70": "Labor Relations",
        "D71": "Glioma Grading",
        "D74": "Post-Operative Patient",
        "D80": "Heart Rate Stress Monitoring",
        "D82": "Diabetes 2019",
        "D87": "Personal Heart Disease Indicators",
        "D92": "Heart Disease (Logistic)",
        "D95": "Diabetes Prediction",
        "D97": "Cardiovascular Disease",
        "D98": "Diabetes 130 US Hospitals",
        "D99": "Heart Disease Dataset",
        "D181": "HCV Data",
        "D184": "Cardiotocography",
        "D189": "Mammographic Mass",
        "D199": "Easiest Diabetes",
        "D200": "Monkey-Pox Patients",
        # The remaining entries are listed out of numeric order in the source.
        "D54": "Breast Cancer Wisconsin",
        "D63": "Sick-euthyroid",
        "D64": "Ann-test",
        "D65": "Ann-train",
        "D66": "Hypothyroid",
        "D67": "New-thyroid",
        "D72": "Glioma Grading",
    },
    "Gaming & Sports": {
        "D27": "Chess King-Rook",
        "D36": "Tic-Tac-Toe",
        "D40": "IPL 2022 Matches",
        "D41": "League of Legends",
        "D55": "League of Legends Diamond",
        "D56": "Chess Game Dataset",
        "D57": "Game of Thrones",
        "D73": "Connect-4",
        "D75": "FIFA 2018",
        "D76": "Dota 2 Matches",
        "D77": "IPL Match Analysis",
        "D78": "CS:GO Professional",
        "D79": "IPL 2008-2022",
        "D114": "Video Games",
        "D115": "Video Games Sales",
        "D117": "Sacred Games",
        "D118": "PC Games Sales",
        "D119": "Popular Video Games",
        "D120": "Olympic Games 2021",
        "D121": "Video Games ESRB",
        "D122": "Top Play Store Games",
        "D123": "Steam Games",
        "D124": "PS4 Games",
        "D116": "Video Games Sales",
    },
    "Education & Students": {
        "D43": "Student Marks",
        "D44": "Student 2nd Year Result",
        "D45": "Student Mat Pass/Fail",
        "D103": "Academic Performance",
        "D104": "Student Academic Analysis",
        "D105": "Student Dropout Prediction",
        "D106": "Electronic Gadgets Impact",
        "D107": "Campus Recruitment",
        "D108": "End-Semester Performance",
        "D109": "Fitbits and Grades",
        "D110": "Student Time Management",
        "D111": "Student Feedback",
        "D112": "Depression & Performance",
        "D113": "University Rankings",
        "D126": "University Ranking CWUR",
        "D127": "University Ranking CWUR 2013-2014",
        "D128": "University Ranking CWUR 2014-2015",
        "D129": "University Ranking CWUR 2015-2016",
        "D130": "University Ranking CWUR 2016-2017",
        "D131": "University Ranking CWUR 2017-2018",
        "D132": "University Ranking CWUR 2018-2019",
        "D133": "University Ranking CWUR 2019-2020",
        "D134": "University Ranking CWUR 2020-2021",
        "D135": "University Ranking CWUR 2021-2022",
        "D136": "University Ranking CWUR 2022-2023",
        "D137": "University Ranking GM 2016",
        "D138": "University Ranking GM 2017",
        "D139": "University Ranking GM 2018",
        "D140": "University Ranking GM 2019",
        "D141": "University Ranking GM 2020",
        "D142": "University Ranking GM 2021",
        "D143": "University Ranking GM 2022",
        "D144": "University Ranking Webometric 2012",
        "D145": "University Ranking Webometric 2013",
        "D146": "University Ranking Webometric 2014",
        "D147": "University Ranking Webometric 2015",
        "D148": "University Ranking Webometric 2016",
        "D149": "University Ranking Webometric 2017",
        "D150": "University Ranking Webometric 2018",
        "D151": "University Ranking Webometric 2019",
        "D152": "University Ranking Webometric 2020",
        "D153": "University Ranking Webometric 2021",
        "D154": "University Ranking Webometric 2022",
        "D155": "University Ranking Webometric 2023",
        "D156": "University Ranking URAP 2018-2019",
        "D157": "University Ranking URAP 2019-2020",
        "D158": "University Ranking URAP 2020-2021",
        "D159": "University Ranking URAP 2021-2022",
        "D160": "University Ranking URAP 2022-2023",
        "D161": "University Ranking THE 2011",
        "D162": "University Ranking THE 2012",
        "D163": "University Ranking THE 2013",
        "D164": "University Ranking THE 2014",
        "D165": "University Ranking THE 2015",
        "D166": "University Ranking THE 2016",
        "D167": "University Ranking THE 2017",
        "D168": "University Ranking THE 2018",
        "D169": "University Ranking THE 2019",
        "D170": "University Ranking THE 2020",
        "D171": "University Ranking THE 2021",
        "D172": "University Ranking THE 2022",
        "D173": "University Ranking THE 2023",
        "D174": "University Ranking QS 2022",
        "D190": "Student Academics Performance"
    },
    "Banking & Finance": {
        "D6": "Bank Marketing 1",
        "D7": "Bank Marketing 2",
        "D30": "Adult Income",
        "D32": "Telco Customer Churn",
        "D35": "Credit Approval",
        "D50": "Term Deposit Prediction",
        "D96": "Credit Card Fraud",
        "D188": "South German Credit",
        "D193": "Credit Risk Classification",
        "D195": "Credit Score Classification",
        "D196": "Banking Classification"
    },
    "Science & Engineering": {
        "D8": "Mushroom",
        "D14": "Ionosphere",
        "D33": "EEG Eye State",
        "D37": "Steel Plates Faults",
        "D39": "Fertility",
        "D51": "Darwin",
        "D58": "EEG Emotions",
        "D81": "Predictive Maintenance",
        "D84": "Oranges vs Grapefruit",
        "D90": "Crystal System Li-ion",
        "D183": "Drug Consumption",
        "D49": "Air Pressure System Failures",
        "D93": "Air Pressure System Failures",
        "D185": "Toxicity",
        "D186": "Toxicity",
    },
    "Social & Lifestyle": {
        "D38": "Online Shoppers",
        "D59": "Red Wine Quality",
        "D60": "White Wine Quality",
        "D88": "Airline Passenger Satisfaction",
        "D94": "Go Emotions Google",
        "D100": "Spotify East Asian",
        "D125": "Suicide Rates",
        "D182": "Obesity Levels",
        "D187": "Blood Transfusion",
        "D191": "Obesity Classification",
        "D192": "Gender Classification",
        "D194": "Happiness Classification",
        "D42": "Airline customer Holiday Booking dataset"
    },
    "ML Benchmarks & Synthetic": {
        "D34": "Spambase",
        "D85": "Synthetic Binary",
        "D89": "Naive Bayes Data",
        "D175": "Monk's Problems 1",
        "D176": "Monk's Problems 2",
        "D177": "Monk's Problems 3",
        "D178": "Monk's Problems 4",
        "D179": "Monk's Problems 5",
        "D180": "Monk's Problems 6"
    },
    "Other": {
        "D83": "Paris Housing",
        "D91": "Fake Bills",
        "D197": "Star Classification"
    }
}
def compute_metrics(datasets_list, selected_models, metric_for_comparison):
    """Filter the results table and run pairwise paired t-tests on one metric.

    Args:
        datasets_list: Dataset IDs (values of the 'dataset' column) to keep.
        selected_models: Model names to keep.  If the sentinel "All models"
            appears anywhere in it, every model in ALL_MODELS is used.
        metric_for_comparison: Name of the metric column to compare.

    Returns:
        A ``(df, stat_df, fig)`` tuple:

        * ``df`` -- copy of the rows of ``df_results`` matching the filter.
        * ``stat_df`` -- one row per model pair with columns ``model1``,
          ``model2``, ``mean_diff`` (mean of model1 minus mean of model2 over
          the datasets both models have a value for), ``t_stat``, ``p_value``
          and ``significant`` ("Yes" when p < 0.05, otherwise "No").  Pairs
          sharing fewer than two datasets are skipped, so this may be empty.
        * ``fig`` -- the heatmap returned by ``create_heatmap``, or ``None``
          when the filter matched no rows.
    """
    # Handle "All models" selection
    if "All models" in selected_models:
        selected_models = ALL_MODELS

    # Filter the dataframe for selected datasets and models
    keep = (
        df_results['dataset'].isin(datasets_list)
        & df_results['model'].isin(selected_models)
    )
    df = df_results[keep].copy()
    if df.empty:
        return df, pd.DataFrame(), None

    # Build the dataset x model score table once instead of re-filtering the
    # frame for every pair.  pivot_table averages repeated (dataset, model)
    # rows, which is the same aggregation the heatmap shows and keeps the
    # pairing well defined when the CSV holds duplicates.
    scores = df.pivot_table(
        values=metric_for_comparison, index='dataset', columns='model'
    )

    # Statistical comparisons
    stat_records = []
    models_list = df['model'].unique().tolist()
    for i, m1 in enumerate(models_list):
        for m2 in models_list[i + 1:]:
            # A model whose metric values are all missing has no column in
            # the pivot; there is nothing to pair it with.
            if m1 not in scores.columns or m2 not in scores.columns:
                continue
            # Keep only datasets where both models have a value.
            paired = scores[[m1, m2]].dropna()
            if len(paired) < 2:
                continue
            t_stat, p_val = ttest_rel(paired[m1], paired[m2])
            stat_records.append({
                "model1": m1,
                "model2": m2,
                "mean_diff": paired[m1].mean() - paired[m2].mean(),
                "t_stat": t_stat,
                "p_value": p_val,
                # A NaN p-value compares False and is reported as "No".
                "significant": "Yes" if p_val < 0.05 else "No",
            })
    stat_df = pd.DataFrame(stat_records)

    # Create visualization
    fig = create_heatmap(df, metric_for_comparison)
    return df, stat_df, fig
def create_heatmap(df, metric):
    """Draw a dataset-by-model heatmap of one metric.

    Args:
        df: Results rows containing 'dataset', 'model' and ``metric`` columns.
        metric: Name of the metric column to plot.

    Returns:
        The matplotlib Figure.  It is already deregistered from pyplot, so
        the caller does not need to close it.
    """
    # Rows are datasets, columns are models; repeated (dataset, model) rows
    # are averaged by pivot_table.
    pivot = df.pivot_table(values=metric, index='dataset', columns='model')
    # Grow the figure with the number of datasets so annotations stay legible.
    fig, ax = plt.subplots(figsize=(12, max(8, len(pivot) * 0.4)))
    try:
        if pivot.empty:
            # Every value of the metric is missing for this selection;
            # seaborn cannot draw an empty table, so show a placeholder.
            ax.text(0.5, 0.5, f'No {metric} values to display',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_axis_off()
        else:
            sns.heatmap(pivot, annot=True, fmt='.3f', cmap='viridis', ax=ax,
                        cbar_kws={'label': metric.capitalize()})
            ax.set_title(f'{metric.capitalize()} by Dataset and Model',
                         fontsize=14, fontweight='bold')
            ax.set_xlabel('Model', fontsize=12)
            ax.set_ylabel('Dataset', fontsize=12)
        # Lay out this figure explicitly rather than pyplot's global
        # "current figure", which another request may have changed.
        fig.tight_layout()
    finally:
        # pyplot keeps a reference to every figure it creates until it is
        # closed; without this a long-running app accumulates one figure per
        # request.  The Figure object itself remains usable for rendering.
        plt.close(fig)
    return fig
def run_evaluation(selected_datasets, selected_models, metric_comparison):
    """Run the comparison for each chosen metric and build the UI outputs.

    Args:
        selected_datasets: Dataset IDs to evaluate.
        selected_models: Model names; an empty selection means "All models".
        metric_comparison: One metric name or a list of them.  Only the first
            four are processed because there are four plot outputs.

    Returns:
        A 5-tuple ``(stats_html, plot1, plot2, plot3, plot4)``.  ``stats_html``
        is either a status message or the concatenated t-test tables; each
        plot entry is a ``gr.update`` that shows a heatmap or hides the slot.
    """
    max_plots = 4  # number of plot outputs returned to the UI
    hidden = gr.update(value=None, visible=False)

    def message_only(text):
        # Status message with every plot slot hidden.
        return (text,) + (hidden,) * max_plots

    if not selected_datasets:
        return message_only("Please select datasets")
    if not selected_models:
        selected_models = ["All models"]
    # Ensure metric_comparison is a list
    if isinstance(metric_comparison, str):
        metric_comparison = [metric_comparison]
    if not metric_comparison:
        return message_only("Please select at least one metric")

    stats_sections = []
    plots = []
    for metric in metric_comparison[:max_plots]:
        df, stat_df, fig = compute_metrics(selected_datasets, selected_models, metric)
        if df.empty:
            # The row filter does not depend on the metric, so an empty
            # result for one metric means an empty result for all of them.
            # Checking here avoids a separate up-front call that would
            # recompute the statistics and draw a heatmap only to discard it.
            return message_only("No results found")
        if not stat_df.empty:
            stats_sections.append(
                f"\n<h3>Statistical Tests ({metric})</h3>\n"
                "<p>Paired t-tests comparing model performance "
                "(the 'significant' column marks pairs with p &lt; 0.05)</p>\n"
                f"{stat_df.to_html(index=False, float_format='%.4f')}\n"
                "<hr>\n"
            )
        plots.append(gr.update(value=fig, visible=True))

    # Fill remaining slots with hidden empty plots
    plots.extend([hidden] * (max_plots - len(plots)))

    all_stats_html = "".join(stats_sections)
    if not all_stats_html:
        all_stats_html = "<p>Not enough data for statistical comparisons</p>"
    return (all_stats_html, *plots)
# Build Gradio Interface
#
# Layout: a row with the per-category dataset dropdowns on the left and the
# evaluation settings on the right, followed by the results section (t-test
# tables plus up to four heatmaps).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "\n"
        "# 🎯 Model Evaluation Platform\n"
        "### Compare model performance across different datasets\n"
    )

    # Dataset IDs currently chosen across all category dropdowns; written by
    # update_selection and read when the run button is clicked.
    selected_datasets = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Select Datasets")
            # Create dropdowns for each category, offering only datasets
            # that actually have rows in the results table.
            available_ids = set(AVAILABLE_DATASETS)
            dropdowns = []
            for category, datasets in DATASET_CATEGORIES.items():
                choices = [f"{did}: {name}" for did, name in datasets.items()
                           if did in available_ids]
                if choices:
                    dd = gr.Dropdown(
                        choices=choices,
                        label=f"{category} ({len(choices)})",
                        multiselect=True,
                        value=[]
                    )
                    dropdowns.append(dd)

        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Evaluation Settings")
            summary = gr.Markdown("**0 datasets selected**")
            model_input = gr.Dropdown(
                choices=["All models"] + ALL_MODELS,
                label="Models",
                value=["All models"],
                multiselect=True
            )
            metric_comparison = gr.Dropdown(
                choices=["accuracy", "precision", "recall", "f1_score"],
                label="Metrics to Compare",
                value=["accuracy"],
                multiselect=True
            )
            run_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")

    def update_selection(*dropdown_values):
        """Merge all dropdown selections into a summary line and an ID list.

        Each selected value has the form "<ID>: <name>"; only the ID part
        before the first colon is kept.  Returns ``(summary_markdown, ids)``
        with ``ids`` de-duplicated and sorted as strings.
        """
        chosen = set()
        for vals in dropdown_values:
            # An empty multiselect may arrive as None or [].
            chosen.update(v.split(":", 1)[0] for v in vals or ())
        ids = sorted(chosen)
        if ids:
            summary_text = f"**✓ {len(ids)} dataset{'s' if len(ids) != 1 else ''} selected:** {', '.join(ids)}"
        else:
            summary_text = "**No datasets selected**"
        return summary_text, ids

    # Any change in any dropdown recomputes the combined selection from the
    # current values of all dropdowns.
    for dd in dropdowns:
        dd.change(update_selection, inputs=dropdowns, outputs=[summary, selected_datasets])

    gr.Markdown("---")
    gr.Markdown("## 📈 Evaluation Results")
    output_stats = gr.HTML(label="Statistical Tests")
    with gr.Column():
        # One plot slot per metric; run_evaluation shows or hides each slot.
        heatmap_output_1 = gr.Plot(label="Heatmap 1", visible=True)
        heatmap_output_2 = gr.Plot(label="Heatmap 2", visible=False)
        heatmap_output_3 = gr.Plot(label="Heatmap 3", visible=False)
        heatmap_output_4 = gr.Plot(label="Heatmap 4", visible=False)

    run_btn.click(
        run_evaluation,
        inputs=[selected_datasets, model_input, metric_comparison],
        outputs=[output_stats, heatmap_output_1, heatmap_output_2,
                 heatmap_output_3, heatmap_output_4]
    )
# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()