Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from scipy.stats import ttest_rel | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import gradio as gr | |
# Load the benchmark results once at import time. Every callback in this
# module filters this frame by its 'dataset' and 'model' columns.
df_results = pd.read_csv("model_results.csv")

# Distinct model names and dataset IDs present in the results file.
# Missing values are dropped before sorting: a blank cell is read as NaN
# (a float), and sorting NaN together with strings raises TypeError.
ALL_MODELS = sorted(df_results['model'].dropna().unique().tolist())
AVAILABLE_DATASETS = sorted(df_results['dataset'].dropna().unique().tolist())
# Dataset catalogue: category label -> {dataset ID -> display name}.
#
# Insertion order is significant: it is the order in which categories and
# their entries are iterated when the selection dropdowns are built.
#
# NOTE(review): IDs D86, D101, D102 and D198 are not listed under any
# category. If they occur in the results file they cannot be selected from
# the UI -- confirm whether that is intentional.
DATASET_CATEGORIES = {
    "Medical & Healthcare": {
        "D1": "Heart Disease (Comprehensive)",
        "D2": "Heart attack possibility",
        "D3": "Heart Disease Dataset",
        "D4": "Liver Disorders",
        "D5": "Diabetes Prediction",
        "D9": "Chronic Kidney Disease",
        "D10": "Breast Cancer Prediction",
        "D11": "Stroke Prediction",
        "D12": "Lung Cancer Prediction",
        "D13": "Hepatitis",
        "D15": "Thyroid Disease",
        "D16": "Heart Failure Prediction",
        "D17": "Parkinson's",
        "D18": "Indian Liver Patient",
        "D19": "COVID-19 Effect on Liver Cancer",
        "D20": "Liver Dataset",
        "D21": "Specht Heart",
        "D22": "Early-stage Diabetes",
        "D23": "Diabetic Retinopathy",
        "D24": "Breast Cancer Coimbra",
        "D25": "Chronic Kidney Disease",
        "D26": "Kidney Stone",
        "D28": "Echocardiogram",
        "D29": "Bladder Cancer Recurrence",
        "D31": "Prostate Cancer",
        "D46": "Real Breast Cancer Data",
        "D47": "Breast Cancer (Royston)",
        "D48": "Lung Cancer Dataset",
        "D52": "Cervical Cancer Risk",
        "D53": "Breast Cancer Wisconsin",
        "D61": "Breast Cancer Prediction",
        "D62": "Thyroid Disease",
        "D68": "Lung Cancer",
        "D69": "Cancer Patients Data",
        # NOTE(review): "Labor Relations" does not look medical -- verify
        # that this entry belongs in this category.
        "D70": "Labor Relations",
        "D71": "Glioma Grading",
        "D74": "Post-Operative Patient",
        "D80": "Heart Rate Stress Monitoring",
        "D82": "Diabetes 2019",
        "D87": "Personal Heart Disease Indicators",
        "D92": "Heart Disease (Logistic)",
        "D95": "Diabetes Prediction",
        "D97": "Cardiovascular Disease",
        "D98": "Diabetes 130 US Hospitals",
        "D99": "Heart Disease Dataset",
        "D181": "HCV Data",
        "D184": "Cardiotocography",
        "D189": "Mammographic Mass",
        "D199": "Easiest Diabetes",
        "D200": "Monkey-Pox Patients",
        "D54": "Breast Cancer Wisconsin",
        "D63": "Sick-euthyroid",
        "D64": "Ann-test",
        "D65": "Ann-train",
        "D66": "Hypothyroid",
        "D67": "New-thyroid",
        "D72": "Glioma Grading",
    },
    "Gaming & Sports": {
        "D27": "Chess King-Rook",
        "D36": "Tic-Tac-Toe",
        "D40": "IPL 2022 Matches",
        "D41": "League of Legends",
        "D55": "League of Legends Diamond",
        "D56": "Chess Game Dataset",
        "D57": "Game of Thrones",
        "D73": "Connect-4",
        "D75": "FIFA 2018",
        "D76": "Dota 2 Matches",
        "D77": "IPL Match Analysis",
        "D78": "CS:GO Professional",
        "D79": "IPL 2008-2022",
        "D114": "Video Games",
        "D115": "Video Games Sales",
        "D117": "Sacred Games",
        "D118": "PC Games Sales",
        "D119": "Popular Video Games",
        "D120": "Olympic Games 2021",
        "D121": "Video Games ESRB",
        "D122": "Top Play Store Games",
        "D123": "Steam Games",
        "D124": "PS4 Games",
        "D116": "Video Games Sales",
    },
    "Education & Students": {
        "D43": "Student Marks",
        "D44": "Student 2nd Year Result",
        "D45": "Student Mat Pass/Fail",
        "D103": "Academic Performance",
        "D104": "Student Academic Analysis",
        "D105": "Student Dropout Prediction",
        "D106": "Electronic Gadgets Impact",
        "D107": "Campus Recruitment",
        "D108": "End-Semester Performance",
        "D109": "Fitbits and Grades",
        "D110": "Student Time Management",
        "D111": "Student Feedback",
        "D112": "Depression & Performance",
        "D113": "University Rankings",
        "D126": "University Ranking CWUR",
        "D127": "University Ranking CWUR 2013-2014",
        "D128": "University Ranking CWUR 2014-2015",
        "D129": "University Ranking CWUR 2015-2016",
        "D130": "University Ranking CWUR 2016-2017",
        "D131": "University Ranking CWUR 2017-2018",
        "D132": "University Ranking CWUR 2018-2019",
        "D133": "University Ranking CWUR 2019-2020",
        "D134": "University Ranking CWUR 2020-2021",
        "D135": "University Ranking CWUR 2021-2022",
        "D136": "University Ranking CWUR 2022-2023",
        "D137": "University Ranking GM 2016",
        "D138": "University Ranking GM 2017",
        "D139": "University Ranking GM 2018",
        "D140": "University Ranking GM 2019",
        "D141": "University Ranking GM 2020",
        "D142": "University Ranking GM 2021",
        "D143": "University Ranking GM 2022",
        "D144": "University Ranking Webometric 2012",
        "D145": "University Ranking Webometric 2013",
        "D146": "University Ranking Webometric 2014",
        "D147": "University Ranking Webometric 2015",
        "D148": "University Ranking Webometric 2016",
        "D149": "University Ranking Webometric 2017",
        "D150": "University Ranking Webometric 2018",
        "D151": "University Ranking Webometric 2019",
        "D152": "University Ranking Webometric 2020",
        "D153": "University Ranking Webometric 2021",
        "D154": "University Ranking Webometric 2022",
        "D155": "University Ranking Webometric 2023",
        "D156": "University Ranking URAP 2018-2019",
        "D157": "University Ranking URAP 2019-2020",
        "D158": "University Ranking URAP 2020-2021",
        "D159": "University Ranking URAP 2021-2022",
        "D160": "University Ranking URAP 2022-2023",
        "D161": "University Ranking THE 2011",
        "D162": "University Ranking THE 2012",
        "D163": "University Ranking THE 2013",
        "D164": "University Ranking THE 2014",
        "D165": "University Ranking THE 2015",
        "D166": "University Ranking THE 2016",
        "D167": "University Ranking THE 2017",
        "D168": "University Ranking THE 2018",
        "D169": "University Ranking THE 2019",
        "D170": "University Ranking THE 2020",
        "D171": "University Ranking THE 2021",
        "D172": "University Ranking THE 2022",
        "D173": "University Ranking THE 2023",
        "D174": "University Ranking QS 2022",
        "D190": "Student Academics Performance",
    },
    "Banking & Finance": {
        "D6": "Bank Marketing 1",
        "D7": "Bank Marketing 2",
        "D30": "Adult Income",
        "D32": "Telco Customer Churn",
        "D35": "Credit Approval",
        "D50": "Term Deposit Prediction",
        "D96": "Credit Card Fraud",
        "D188": "South German Credit",
        "D193": "Credit Risk Classification",
        "D195": "Credit Score Classification",
        "D196": "Banking Classification",
    },
    "Science & Engineering": {
        "D8": "Mushroom",
        "D14": "Ionosphere",
        "D33": "EEG Eye State",
        "D37": "Steel Plates Faults",
        "D39": "Fertility",
        "D51": "Darwin",
        "D58": "EEG Emotions",
        "D81": "Predictive Maintenance",
        "D84": "Oranges vs Grapefruit",
        "D90": "Crystal System Li-ion",
        "D183": "Drug Consumption",
        "D49": "Air Pressure System Failures",
        "D93": "Air Pressure System Failures",
        "D185": "Toxicity",
        "D186": "Toxicity",
    },
    "Social & Lifestyle": {
        "D38": "Online Shoppers",
        "D59": "Red Wine Quality",
        "D60": "White Wine Quality",
        "D88": "Airline Passenger Satisfaction",
        "D94": "Go Emotions Google",
        "D100": "Spotify East Asian",
        "D125": "Suicide Rates",
        "D182": "Obesity Levels",
        "D187": "Blood Transfusion",
        "D191": "Obesity Classification",
        "D192": "Gender Classification",
        "D194": "Happiness Classification",
        "D42": "Airline customer Holiday Booking dataset",
    },
    "ML Benchmarks & Synthetic": {
        "D34": "Spambase",
        "D85": "Synthetic Binary",
        "D89": "Naive Bayes Data",
        "D175": "Monk's Problems 1",
        "D176": "Monk's Problems 2",
        "D177": "Monk's Problems 3",
        "D178": "Monk's Problems 4",
        "D179": "Monk's Problems 5",
        "D180": "Monk's Problems 6",
    },
    "Other": {
        "D83": "Paris Housing",
        "D91": "Fake Bills",
        "D197": "Star Classification",
    },
}
def compute_metrics(datasets_list, selected_models, metric_for_comparison):
    """Filter the results and run pairwise paired t-tests between models.

    Args:
        datasets_list: Dataset IDs to keep; matched against the 'dataset'
            column of the module-level ``df_results``.
        selected_models: Model names to keep. If it contains the sentinel
            "All models", every entry of ``ALL_MODELS`` is used instead.
        metric_for_comparison: Name of the ``df_results`` column to compare.

    Returns:
        A ``(df, stat_df, fig)`` tuple:

        * ``df`` -- filtered copy of ``df_results``.
        * ``stat_df`` -- one row per model pair with the columns
          ``model1``, ``model2``, ``mean_diff``, ``t_stat``, ``p_value``
          and ``significant`` ("Yes" when ``p_value < 0.05``). Empty when
          no pair of models shares at least two datasets.
        * ``fig`` -- whatever ``create_heatmap`` returns, or ``None`` when
          the filter matched no rows.
    """
    # Handle the "All models" sentinel.
    if "All models" in selected_models:
        selected_models = ALL_MODELS

    row_mask = (
        df_results['dataset'].isin(datasets_list)
        & df_results['model'].isin(selected_models)
    )
    df = df_results[row_mask].copy()
    if df.empty:
        return df, pd.DataFrame(), None

    # One row per dataset, one column per model. pivot_table averages
    # repeated (dataset, model) rows, so every dataset contributes exactly
    # one paired observation -- the same aggregation the heatmap shows.
    # Pairing raw rows by dataset label instead breaks on duplicate labels.
    scores = df.pivot_table(
        values=metric_for_comparison, index='dataset', columns='model'
    )

    # Keep the order in which models first appear in the filtered rows.
    # Models whose metric is entirely missing have no column in `scores`
    # and cannot be compared.
    models_list = [
        m for m in df['model'].unique().tolist() if m in scores.columns
    ]

    stat_records = []
    for i, m1 in enumerate(models_list):
        for m2 in models_list[i + 1:]:
            # Only datasets on which both models have a value can be paired.
            paired = scores[[m1, m2]].dropna()
            if len(paired) < 2:
                # A paired t-test needs at least two observations.
                continue
            t_stat, p_val = ttest_rel(paired[m1], paired[m2])
            stat_records.append({
                "model1": m1,
                "model2": m2,
                "mean_diff": paired[m1].mean() - paired[m2].mean(),
                "t_stat": t_stat,
                "p_value": p_val,
                # A NaN p-value compares False and is reported as "No".
                "significant": "Yes" if p_val < 0.05 else "No",
            })
    stat_df = pd.DataFrame(stat_records)

    fig = create_heatmap(df, metric_for_comparison)
    return df, stat_df, fig
def create_heatmap(df, metric):
    """Draw a dataset-by-model heatmap of one metric.

    Args:
        df: Results frame with 'dataset' and 'model' columns and a column
            named ``metric``.
        metric: Column to plot; its capitalized form is used in the title
            and the colorbar label.

    Returns:
        The matplotlib figure, or ``None`` when there is nothing to plot
        (no rows, or no non-missing values for ``metric``).
    """
    # Rows are datasets, columns are models; pivot_table averages repeated
    # (dataset, model) entries.
    pivot = df.pivot_table(values=metric, index='dataset', columns='model')
    if pivot.empty:
        # seaborn cannot derive a colour range from an empty table.
        return None

    # Grow the figure with the number of datasets so annotations stay legible.
    fig, ax = plt.subplots(figsize=(12, max(8, len(pivot) * 0.4)))
    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='viridis', ax=ax,
                cbar_kws={'label': metric.capitalize()})
    ax.set_title(f'{metric.capitalize()} by Dataset and Model',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Model', fontsize=12)
    ax.set_ylabel('Dataset', fontsize=12)

    # Lay out this figure explicitly rather than pyplot's "current" one.
    fig.tight_layout()
    # Unregister the figure from pyplot so a long-running server does not
    # accumulate one open figure per request. The Figure object itself
    # remains usable by the caller.
    plt.close(fig)
    return fig
def run_evaluation(selected_datasets, selected_models, metric_comparison):
    """Run the evaluation for the current UI selection.

    Args:
        selected_datasets: Dataset IDs chosen by the user.
        selected_models: Model names chosen by the user; an empty selection
            is treated as "All models".
        metric_comparison: A metric name or a list of metric names. Only
            the first four are rendered, one per heatmap slot.

    Returns:
        A 5-tuple ``(stats_html, plot1, plot2, plot3, plot4)``. The first
        item is an HTML string (or a plain status message); the rest are
        ``gr.update`` payloads that show a figure or hide an unused slot.
    """
    max_plots = 4  # Number of plot updates every return value carries.

    def _status_only(message):
        # Status message plus four hidden, empty plot slots.
        hidden = gr.update(value=None, visible=False)
        return (message,) + (hidden,) * max_plots

    if not selected_datasets:
        return _status_only("Please select datasets")
    if not selected_models:
        selected_models = ["All models"]

    # A single metric may arrive as a bare string.
    if isinstance(metric_comparison, str):
        metric_comparison = [metric_comparison]
    if not metric_comparison:
        return _status_only("Please select at least one metric")

    stats_sections = []
    plots = []
    for metric in metric_comparison[:max_plots]:
        # One call per metric yields the filtered rows, the tests and the
        # figure; no separate up-front call is needed.
        df, stat_df, fig = compute_metrics(
            selected_datasets, selected_models, metric
        )
        if df.empty:
            # Row filtering depends only on the datasets and models, so an
            # empty result here means it is empty for every metric.
            return _status_only("No results found")

        if not stat_df.empty:
            stats_sections.append(
                f"<h3>Statistical Tests ({metric})</h3>\n"
                "<p>Paired t-tests comparing model performance "
                "(the \"significant\" column marks p &lt; 0.05)</p>\n"
                f"{stat_df.to_html(index=False, float_format='%.4f')}\n"
                "<hr>\n"
            )
        plots.append(gr.update(value=fig, visible=True))

    # Hide the heatmap slots that were not used.
    while len(plots) < max_plots:
        plots.append(gr.update(value=None, visible=False))

    all_stats_html = "".join(stats_sections)
    if not all_stats_html:
        all_stats_html = "<p>Not enough data for statistical comparisons</p>"

    return (all_stats_html, *plots)
# Build the Gradio interface.
#
# Layout: a row with two columns (dataset pickers on the left, evaluation
# settings on the right), followed by the statistics panel and four
# heatmap slots.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# 🎯 Model Evaluation Platform\n"
        "### Compare model performance across different datasets"
    )

    # Deduplicated dataset IDs currently selected across all dropdowns.
    selected_datasets = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Select Datasets")

            # One multi-select dropdown per category, listing only the
            # datasets that actually occur in the results file.
            available_ids = set(AVAILABLE_DATASETS)
            dropdowns = []
            for category, datasets in DATASET_CATEGORIES.items():
                choices = [
                    f"{did}: {name}"
                    for did, name in datasets.items()
                    if did in available_ids
                ]
                if choices:
                    dd = gr.Dropdown(
                        choices=choices,
                        label=f"{category} ({len(choices)})",
                        multiselect=True,
                        value=[],
                    )
                    dropdowns.append(dd)

        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Evaluation Settings")
            summary = gr.Markdown("**0 datasets selected**")
            model_input = gr.Dropdown(
                choices=["All models"] + ALL_MODELS,
                label="Models",
                value=["All models"],
                multiselect=True,
            )
            metric_comparison = gr.Dropdown(
                choices=["accuracy", "precision", "recall", "f1_score"],
                label="Metrics to Compare",
                value=["accuracy"],
                multiselect=True,
            )
            run_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")

    def update_selection(*dropdown_values):
        """Merge all dropdown selections into a sorted list of dataset IDs.

        Each choice has the form "<id>: <name>"; the ID is the text before
        the first colon. Returns the summary markdown and the ID list.
        """
        ids = []
        for vals in dropdown_values:
            if vals:
                ids.extend(v.split(":", 1)[0] for v in vals)
        ids = sorted(set(ids))
        if ids:
            summary_text = (
                f"**✅ {len(ids)} dataset{'s' if len(ids) != 1 else ''} "
                f"selected:** {', '.join(ids)}"
            )
        else:
            summary_text = "**No datasets selected**"
        return summary_text, ids

    # A change in any dropdown recomputes the selection from all of them.
    for dd in dropdowns:
        dd.change(update_selection, inputs=dropdowns,
                  outputs=[summary, selected_datasets])

    gr.Markdown("---")
    gr.Markdown("## 📈 Evaluation Results")
    output_stats = gr.HTML(label="Statistical Tests")
    with gr.Column():
        # Only the first slot is visible initially; run_evaluation returns
        # one visibility update per slot.
        heatmap_output_1 = gr.Plot(label="Heatmap 1", visible=True)
        heatmap_output_2 = gr.Plot(label="Heatmap 2", visible=False)
        heatmap_output_3 = gr.Plot(label="Heatmap 3", visible=False)
        heatmap_output_4 = gr.Plot(label="Heatmap 4", visible=False)

    run_btn.click(
        run_evaluation,
        inputs=[selected_datasets, model_input, metric_comparison],
        outputs=[output_stats, heatmap_output_1, heatmap_output_2,
                 heatmap_output_3, heatmap_output_4],
    )

if __name__ == "__main__":
    demo.launch()