Spaces:
Build error
Build error
| __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions'] | |
| import gradio as gr | |
| import pandas as pd | |
| import json | |
| import pdb | |
| import tempfile | |
| import re | |
| from constants import * | |
| from src.auto_leaderboard.model_metadata_type import ModelType | |
| global data_component, filter_component | |
def validate_model_size(s):
    """Return *s* when it is a well-formed model size, otherwise ``'-'``.

    A well-formed size is one or more digits followed by an upper-case ``B``
    (for example ``'7B'``), or the placeholder ``'-'`` itself.

    Args:
        s: Raw text typed into the "Model size" field.

    Returns:
        ``s`` unchanged if it is valid, else the placeholder ``'-'``.
    """
    # fullmatch() rather than match() + '$': '$' also matches just before a
    # trailing newline, which would let '7B\n' through unchanged.
    # Non-string input (e.g. None) is treated as "no size given".
    if isinstance(s, str) and re.fullmatch(r'\d+B|-', s):
        return s
    return '-'
def upload_file(files):
    """Collect the ``name`` attribute of each uploaded file object, in order."""
    names = []
    for uploaded in files:
        names.append(uploaded.name)
    return names
def _score_predictions(prediction_content, ground_truth_path, num_question_types):
    """Tally correct and total predictions per question type.

    Args:
        prediction_content: Text holding one JSON object per line. Each object
            must provide ``question_id`` and ``prediction``.
        ground_truth_path: Path of the benchmark JSON file. Its ``questions``
            list holds items with ``question_id``, ``question_type_id`` and
            ``answer``.
        num_question_types: Number of question types; the result has one entry
            for every id from 1 to this value inclusive.

    Returns:
        ``{question_type_id: {"correct": int, "total": int}}``.
    """
    # Read the ground truth with an explicit encoding so the result does not
    # depend on the platform's locale default.
    with open(ground_truth_path, "r", encoding="utf-8") as file:
        ground_truth_data = json.load(file)["questions"]

    # Index the ground truth by question_id for O(1) lookup.
    ground_truth = {item["question_id"]: item for item in ground_truth_data}

    # One counter pair per question type.
    results = {i: {"correct": 0, "total": 0} for i in range(1, num_question_types + 1)}

    # split("\n") (not splitlines()) so that only real newlines separate records.
    for line in prediction_content.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            prediction = json.loads(line)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {line}")
            continue

        # A syntactically valid line may still not be a usable record
        # (e.g. a bare list, or an object without the required keys).
        if (
            not isinstance(prediction, dict)
            or "question_id" not in prediction
            or "prediction" not in prediction
        ):
            print(f"Warning: Skipping record without question_id/prediction: {line}")
            continue

        try:
            gt_item = ground_truth[prediction["question_id"]]
        except (KeyError, TypeError):
            # Unknown (or unhashable) question id: not part of this benchmark.
            continue

        counters = results.get(gt_item["question_type_id"])
        if counters is None:
            # Question type outside 1..num_question_types: nothing to tally.
            continue

        if prediction["prediction"] == gt_item["answer"]:
            counters["correct"] += 1
        counters["total"] += 1

    return results


def prediction_analyse(prediction_content):
    """Score a SEED-Bench-1 submission (question types 1-12).

    Reads the ground truth from ``./file/SEED-Bench-1.json`` and returns
    ``{question_type_id: {"correct": int, "total": int}}``.
    """
    return _score_predictions(prediction_content, "./file/SEED-Bench-1.json", 12)


def prediction_analyse_v2(prediction_content):
    """Score a SEED-Bench-2 submission (question types 1-27).

    Reads the ground truth from ``./file/SEED-Bench-2.json`` and returns
    ``{question_type_id: {"correct": int, "total": int}}``.
    """
    return _score_predictions(prediction_content, "./file/SEED-Bench-2.json", 27)
def _pooled_accuracy(prediction, start, stop):
    """Percentage of correct answers pooled over question types ``start..stop-1``.

    Returns 0 when no prediction was scored in that range, so an incomplete
    submission does not raise ``ZeroDivisionError``.
    """
    correct = sum(prediction[i]["correct"] for i in range(start, stop))
    total = sum(prediction[i]["total"] for i in range(start, stop))
    if total == 0:
        return 0
    return round(correct / total * 100, 1)


def _per_task_accuracy(prediction, num_tasks, start, stop):
    """Accuracy for tasks 1..num_tasks; tasks outside ``[start, stop)`` are 0."""
    return {
        i: _pooled_accuracy(prediction, i, i + 1) if start <= i < stop else 0
        for i in range(1, num_tasks + 1)
    }


def _mean_task_accuracy(each_task_accuracy, start, stop):
    """Unweighted mean of the per-task accuracies of tasks ``start..stop-1``."""
    return round(sum(each_task_accuracy[key] for key in range(start, stop)) / (stop - start), 1)


def _plain_model_name(cell):
    """Return the bare model name from a 'Model' cell.

    Cells are either ``[name](url)`` or just ``name``. Only the linked form is
    unwrapped; a plain name is returned untouched (slicing it blindly would
    drop its first character and make it impossible to match).
    """
    text = str(cell)
    if text.startswith('[') and ']' in text:
        return text.split(']')[0][1:]
    return text


def _resolve_row(csv_data, model_name_textbox, revision_name_textbox, model_link):
    """Choose the row to write and the value for the 'Model' column.

    Without a revision name a new row is appended. With one, the row whose
    model name equals it is overwritten, or a new row is appended if there is
    no such row.
    """
    append_at = csv_data.shape[0]
    if revision_name_textbox == '':
        row = append_at
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        name_list = [_plain_model_name(name) for name in csv_data['Model']]
        row = name_list.index(revision_name_textbox) if revision_name_textbox in name_list else append_at
    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'
    return row, model_name


def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_type: str,
    model_link: str,
    model_size: str,
    benchmark_version: str,
    LLM_type: str,
    LLM_name_textbox: str,
    Evaluation_dimension: str,
    Evaluation_dimension_2: str,
    Evaluation_method: str
):
    """Score an uploaded prediction file and record it in the leaderboard CSVs.

    Args:
        input_file: UTF-8 encoded bytes of the prediction file, or ``None``.
        model_name_textbox: Model name used when adding a new entry.
        revision_name_textbox: Name of an existing entry to overwrite; an empty
            string means "add a new entry".
        model_type: Model category; stored only in the v1 tables.
        model_link: Optional URL; when non-empty the model name is stored as a
            markdown link.
        model_size: Size string, normalised by ``validate_model_size``.
        benchmark_version: ``'v1'`` selects SEED-Bench-1; any other value
            selects SEED-Bench-2.
        LLM_type: Selected language model, or ``'Other'``.
        LLM_name_textbox: Language model name used when ``LLM_type`` is
            ``'Other'``.
        Evaluation_dimension: v1 scope: ``'All'``, ``'Image'`` or ``'Video'``.
        Evaluation_dimension_2: v2 scope: ``'Single'``, ``'L1'``, ``'L2'`` or
            ``'L3'``.
        Evaluation_method: Evaluation method label stored with the entry.

    Returns:
        ``"Error! Empty file!"`` when no file was uploaded, otherwise ``0``.
    """
    if input_file is None:
        return "Error! Empty file!"

    model_size = validate_model_size(model_size)
    content = input_file.decode("utf-8")
    LLM_name = LLM_name_textbox if LLM_type == 'Other' else LLM_type

    if benchmark_version == 'v1':
        # v1 evaluation: tasks 1-9 are image tasks, 10-12 are video tasks.
        prediction = prediction_analyse(content)
        csv_path, csv_task_path = CSV_DIR, CSV_TASK_DIR
        num_tasks = 12

        start, stop = 1, 13
        if Evaluation_dimension == 'Image':
            stop = 10
        elif Evaluation_dimension == 'Video':
            start = 10
        each_task_accuracy = _per_task_accuracy(prediction, num_tasks, start, stop)

        # Averages that were not evaluated stay at 0.
        average_accuracy_image = average_task_accuracy_image = 0
        average_accuracy_video = average_task_accuracy_video = 0
        overall_accuracy = overall_task_accuracy = 0
        if Evaluation_dimension != 'Video':
            average_accuracy_image = _pooled_accuracy(prediction, 1, 10)
            average_task_accuracy_image = _mean_task_accuracy(each_task_accuracy, 1, 10)
        if Evaluation_dimension != 'Image':
            average_accuracy_video = _pooled_accuracy(prediction, 10, 13)
            average_task_accuracy_video = _mean_task_accuracy(each_task_accuracy, 10, 13)
        if Evaluation_dimension == 'All':
            overall_accuracy = _pooled_accuracy(prediction, 1, 13)
            overall_task_accuracy = _mean_task_accuracy(each_task_accuracy, 1, 13)

        # v1 rows start with the model type; v2 rows do not.
        leading_info = [model_type]
        averages = [overall_accuracy, average_accuracy_image, average_accuracy_video]
        task_averages = [overall_task_accuracy, average_task_accuracy_image, average_task_accuracy_video]
    else:
        # v2 evaluation: 1-16 single image, 17-18 multi image, 19-22 video
        # (together "P1"), 23-24 "P2", 25-27 "P3".
        prediction = prediction_analyse_v2(content)
        csv_path, csv_task_path = CSV_V2_DIR, CSV_V2_TASK_DIR
        num_tasks = 27

        start, stop = 1, 28
        if Evaluation_dimension_2 == 'Single':
            stop = 17
        elif Evaluation_dimension_2 == 'L1':
            stop = 23
        elif Evaluation_dimension_2 == 'L2':
            stop = 25
        elif Evaluation_dimension_2 == 'L3':
            stop = 28
        each_task_accuracy = _per_task_accuracy(prediction, num_tasks, start, stop)

        average_single = _pooled_accuracy(prediction, 1, 17)
        average_task_single = _mean_task_accuracy(each_task_accuracy, 1, 17)

        # Averages that were not evaluated stay at 0.
        average_multi = average_video = average_p1 = average_p2 = average_p3 = 0
        average_task_multi = average_task_video = 0
        average_task_p1 = average_task_p2 = average_task_p3 = 0
        if Evaluation_dimension_2 != 'Single':
            average_multi = _pooled_accuracy(prediction, 17, 19)
            average_video = _pooled_accuracy(prediction, 19, 23)
            average_p1 = _pooled_accuracy(prediction, 1, 23)
            average_task_multi = _mean_task_accuracy(each_task_accuracy, 17, 19)
            average_task_video = _mean_task_accuracy(each_task_accuracy, 19, 23)
            average_task_p1 = _mean_task_accuracy(each_task_accuracy, 1, 23)
            if Evaluation_dimension_2 in ('L2', 'L3'):
                average_p2 = _pooled_accuracy(prediction, 23, 25)
                average_task_p2 = _mean_task_accuracy(each_task_accuracy, 23, 25)
            if Evaluation_dimension_2 == 'L3':
                average_p3 = _pooled_accuracy(prediction, 25, 28)
                average_task_p3 = _mean_task_accuracy(each_task_accuracy, 25, 28)

        leading_info = []
        averages = [average_single, average_multi, average_video, average_p1, average_p2, average_p3]
        task_averages = [
            average_task_single, average_task_multi, average_task_video,
            average_task_p1, average_task_p2, average_task_p3,
        ]

    csv_data = pd.read_csv(csv_path)
    csv_task_data = pd.read_csv(csv_task_path)

    # The row index found in the "all average" table is reused for the "task
    # average" table, i.e. both files are expected to list models in the same
    # order.
    row, model_name = _resolve_row(csv_data, model_name_textbox, revision_name_textbox, model_link)

    model_info = leading_info + [model_name, LLM_name, model_size, Evaluation_method]
    task_scores = [each_task_accuracy[i] for i in range(1, num_tasks + 1)]

    # Sample-weighted averages go to the "all average" table ...
    csv_data.loc[row] = model_info + averages + task_scores
    csv_data.to_csv(csv_path, index=False)

    # ... and per-task (unweighted) averages go to the "task average" table.
    csv_task_data.loc[row] = model_info + task_averages + task_scores
    csv_task_data.to_csv(csv_task_path, index=False)
    return 0
def get_baseline_df(average_type):
    """Load the v1 leaderboard table shown by default.

    ``'All Average'`` reads ``CSV_DIR``; any other value reads
    ``CSV_TASK_DIR``. Rows are ordered by "Avg. All" (highest first) and only
    the ``MODEL_INFO`` columns plus the columns held in
    ``checkbox_group.value`` are returned.
    """
    source_csv = CSV_DIR if average_type == 'All Average' else CSV_TASK_DIR
    table = pd.read_csv(source_csv).sort_values(by="Avg. All", ascending=False)
    visible_columns = MODEL_INFO + checkbox_group.value
    return table[visible_columns]
def get_baseline_v2_df(average_type):
    """Load the v2 leaderboard table shown by default.

    ``'All Average'`` reads ``CSV_V2_DIR``; any other value reads
    ``CSV_V2_TASK_DIR``. Rows are ordered by "Avg. Single" (highest first) and
    only the ``MODEL_INFO_V2`` columns plus the columns held in
    ``checkbox_group_v2.value`` are returned.
    """
    source_csv = CSV_V2_DIR if average_type == 'All Average' else CSV_V2_TASK_DIR
    table = pd.read_csv(source_csv).sort_values(by="Avg. Single", ascending=False)
    visible_columns = MODEL_INFO_V2 + checkbox_group_v2.value
    return table[visible_columns]
def get_all_df():
    """Return the full v1 table from ``CSV_DIR``, best "Avg. All" first."""
    return pd.read_csv(CSV_DIR).sort_values(by="Avg. All", ascending=False)
def get_all_v2_df():
    """Return the full v2 table from ``CSV_V2_DIR``, best "Avg. P1" first."""
    return pd.read_csv(CSV_V2_DIR).sort_values(by="Avg. P1", ascending=False)
def switch_version(version):
    """Return the "current version" banner text for *version*.

    The label is Chinese for "current version", followed by a colon, a space
    and the given version.
    """
    # The label had been stored as mis-decoded UTF-8; this is the intended text.
    return f"当前版本: {version}"
def _row_matches_filters(row, model_size_filters, evaluation_method_filters):
    """Return True when a leaderboard row passes both check-box filters.

    ``'-'`` sizes match the ``'-'`` filter; ``'<number>B'`` sizes match
    ``'>=10B'`` or ``'<10B'`` by their numeric value; anything else (including
    an empty or malformed cell) matches nothing.
    """
    model_size = str(row['Model Size'])  # str(): an empty CSV cell is not a string
    if model_size == '-':
        size_filter = '-' in model_size_filters
    elif 'B' in model_size:
        try:
            size = float(model_size.replace('B', ''))
        except ValueError:
            size_filter = False
        else:
            size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
    else:
        size_filter = False
    method_filter = row['Evaluation Method'] in evaluation_method_filters
    return size_filter and method_filter


def _filter_leaderboard(table, selected_model_size, selected_evaluation_method,
                        selected_columns, task_info, model_info):
    """Apply the row filters and the column selection to a leaderboard table.

    Returns ``(filtered_table, headers)``. Selected columns are shown in the
    order of ``task_info`` and the rows are sorted by the first of them; when
    no column is selected only the ``model_info`` columns are kept and the row
    order is left as is.
    """
    # A boolean Series built explicitly (instead of DataFrame.apply) so that an
    # empty table still yields a valid mask.
    mask = pd.Series(
        [
            bool(_row_matches_filters(row, selected_model_size, selected_evaluation_method))
            for _, row in table.iterrows()
        ],
        index=table.index,
        dtype=bool,
    )
    table = table[mask]

    ordered_columns = [item for item in task_info if item in selected_columns]
    headers = model_info + ordered_columns
    table = table[headers]
    if ordered_columns:
        table = table.sort_values(by=ordered_columns[0], ascending=False)
    return table, headers


block = gr.Blocks()

# NOTE(review): the nesting of the `with` blocks below was reconstructed from a
# flattened copy of this file; confirm the layout against the running app.
# NOTE(review): the emoji prefixes of the tab labels and of the submit heading
# look mis-encoded; the strings are kept exactly as found. Restore them from
# the original file.
with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # table seed-bench-v2
        with gr.TabItem("๐ SEED Benchmark v2", elem_id="seed-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                    ).style(show_copy_button=True)

            gr.Markdown(
                TABLE_INTRODUCTION
            )

            # selection for column part:
            checkbox_group_v2 = gr.CheckboxGroup(
                choices=TASK_V2_INFO,
                value=AVG_V2_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )

            with gr.Row():
                # selection for model size part:
                model_size_v2 = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

                # selection for evaluation method part:
                evaluation_method_v2 = gr.CheckboxGroup(
                    choices=EVALUATION_METHOD,
                    value=EVALUATION_METHOD,
                    label="Evaluation Method",
                    interactive=True,
                )

                average_type_v2 = gr.Radio(AVERAGE_TYPE, label="Performance Average Type", value="All Average")

            baseline_v2_value = get_baseline_v2_df(average_type_v2.value)
            baseline_v2_header = MODEL_INFO_V2 + checkbox_group_v2.value
            # assumes MODEL_INFO_V2 holds 3 markdown columns — TODO confirm
            baseline_v2_datatype = ['markdown'] * 3 + ['number'] * len(checkbox_group_v2.value)

            # Create the dataframe component.
            data_component_v2 = gr.components.Dataframe(
                value=baseline_v2_value,
                headers=baseline_v2_header,
                type="pandas",
                datatype=baseline_v2_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_v2_change(selected_model_size, selected_evaluation_method, selected_columns):
                """Rebuild the v2 table for the current filter/column selection."""
                # NOTE(review): this always reloads the "All Average" CSV,
                # whatever the average-type radio currently shows.
                updated_data, updated_headers = _filter_leaderboard(
                    get_all_v2_df(),
                    selected_model_size,
                    selected_evaluation_method,
                    selected_columns,
                    TASK_V2_INFO,
                    MODEL_INFO_V2,
                )
                update_datatype = [DATA_TITILE_V2_TYPE[COLUMN_V2_NAMES.index(x)] for x in updated_headers]

                # A throw-away component is built only to obtain its value in
                # the form the output Dataframe accepts.
                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component.value

            def on_average_type_v2_change(average_type_v2):
                """Reload the v2 table for the chosen average type."""
                return get_baseline_v2_df(average_type_v2)

            average_type_v2.change(fn=on_average_type_v2_change, inputs=[average_type_v2], outputs=data_component_v2)
            model_size_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
            evaluation_method_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
            checkbox_group_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)

        # table seed-bench-v1
        with gr.TabItem("๐ SEED Benchmark v1", elem_id="seed-benchmark-tab-table", id=1):
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                    ).style(show_copy_button=True)

            gr.Markdown(
                TABLE_INTRODUCTION
            )

            # selection for column part:
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

                # selection for evaluation method part:
                evaluation_method = gr.CheckboxGroup(
                    choices=EVALUATION_METHOD,
                    value=EVALUATION_METHOD,
                    label="Evaluation Method",
                    interactive=True,
                )

                average_type = gr.Radio(AVERAGE_TYPE, label="Performance Average Type", value="All Average")

            baseline_value = get_baseline_df(average_type.value)
            baseline_header = MODEL_INFO + checkbox_group.value
            # assumes MODEL_INFO holds 4 markdown columns — TODO confirm
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_group.value)

            # Create the dataframe component.
            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_evaluation_method, selected_columns):
                """Rebuild the v1 table for the current filter/column selection."""
                # NOTE(review): this always reloads the "All Average" CSV,
                # whatever the average-type radio currently shows.
                updated_data, updated_headers = _filter_leaderboard(
                    get_all_df(),
                    selected_model_size,
                    selected_evaluation_method,
                    selected_columns,
                    TASK_INFO,
                    MODEL_INFO,
                )
                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

                # A throw-away component is built only to obtain its value in
                # the form the output Dataframe accepts.
                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component.value

            def on_average_type_change(average_type):
                """Reload the v1 table for the chosen average type."""
                return get_baseline_df(average_type)

            average_type.change(fn=on_average_type_change, inputs=[average_type], outputs=data_component)
            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
            evaluation_method.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
            checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)

        # table 2
        with gr.TabItem("๐ About", elem_id="seed-benchmark-tab-table", id=2):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # table 3
        with gr.TabItem("๐ Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# โ๏ธโจ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_type = gr.Dropdown(
                        choices=[
                            "LLM",
                            "ImageLLM",
                            "VideoLLM",
                            "Other",
                        ],
                        label="Model type",
                        multiselect=False,
                        value="ImageLLM",
                        interactive=True,
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                    # Rebinds `model_size`: from here on the name refers to the
                    # submit-form textbox. The v1 filter events above were
                    # already wired with the check-box component itself.
                    model_size = gr.Textbox(
                        label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
                    )
                    benchmark_version= gr.Dropdown(
                        choices=["v1", "v2"],
                        label="Benchmark version",
                        multiselect=False,
                        value="v1",
                        interactive=True,
                    )

                with gr.Column():
                    LLM_type = gr.Dropdown(
                        choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
                        label="LLM type",
                        multiselect=False,
                        value="LLaMA-7B",
                        interactive=True,
                    )
                    LLM_name_textbox = gr.Textbox(
                        label="LLM model (Required for Other)",
                        placeholder="LLaMA-13B",
                        value="LLaMA-13B",
                    )
                    Evaluation_dimension = gr.Dropdown(
                        choices=["All", "Image", "Video"],
                        label="Evaluation dimension for SEED-Bench 1(for evaluate SEED-Bench 1)",
                        multiselect=False,
                        value="All",
                        interactive=True,
                    )
                    Evaluation_dimension_2 = gr.Dropdown(
                        choices=["Single", "L1", "L2", "L3"],
                        label="Evaluation dimension for SEED-Bench 2(for evaluate SEED-Bench 2)",
                        multiselect=False,
                        value="L2",
                        interactive=True,
                    )
                    Evaluation_method = gr.Dropdown(
                        choices=EVALUATION_METHOD,
                        label="Evaluation method",
                        multiselect=False,
                        value=EVALUATION_METHOD[0],
                        interactive=True,
                    )

                with gr.Column():
                    input_file = gr.inputs.File(label = "Click to Upload a json File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")
                    submission_result = gr.Markdown()
                    submit_button.click(
                        add_new_eval,
                        inputs = [
                            input_file,
                            model_name_textbox,
                            revision_name_textbox,
                            model_type,
                            model_link,
                            model_size,
                            benchmark_version,
                            LLM_type,
                            LLM_name_textbox,
                            Evaluation_dimension,
                            Evaluation_dimension_2,
                            Evaluation_method
                        ],
                    )

    def refresh_data(average_type_value, average_type_v2_value):
        """Reload both leaderboard tables for the currently chosen average types."""
        value1 = get_baseline_df(average_type_value)
        value2 = get_baseline_v2_df(average_type_v2_value)
        return value1, value2

    with gr.Row():
        data_run = gr.Button("Refresh")
        # The radio values are passed in as inputs: calling the loaders with
        # the Radio components themselves never equals 'All Average', so the
        # task-average tables would always be shown.
        data_run.click(
            refresh_data, inputs=[average_type, average_type_v2], outputs=[data_component, data_component_v2]
        )

    # block.load(get_baseline_df, outputs=data_title)

block.launch()