import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.evaluator import Evaluator
from bigcode_eval.tasks import ALL_TASKS
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel
| |
|
@dataclass
class BigcodeEvalArguments:
    """Default arguments for bigcode-evaluation-harness generation/evaluation.

    Fields mirror the harness CLI flags; instances are customized via
    ``dataclasses.replace`` in ``code_eval``. Fields typed ``Optional[...]``
    may legitimately be ``None`` (callers in this file pass ``None`` for
    ``temperature``/``top_p``/``top_k`` in the greedy pass@1 configuration).
    """

    prefix: str = ""
    do_sample: bool = True
    temperature: Optional[float] = 0.8
    top_k: Optional[int] = 0
    top_p: Optional[float] = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1  # -1 disables intermediate saving
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False
| |
|
def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """Run bigcode-eval code evaluation on `task` using `model`/`tokenizer`.

    Args:
        model: Causal LM to evaluate; moved to CUDA here.
        tokenizer: Tokenizer for the model.
        task: Task name; must match an entry in ALL_TASKS (fnmatch patterns
            are accepted).
        dmx_config: Name of an attribute on dmx.compressor.config_rules used
            to transform the model, or None to evaluate the plain torch model.
        args: Optional dict of overrides applied to BigcodeEvalArguments.
        accelerator: Optional accelerate.Accelerator; created lazily if absent.

    Returns:
        Dict mapping the task name to its metrics. For n_samples of 1 or 10
        only the corresponding pass@k entry is kept; otherwise the full
        metric dict is returned.

    Raises:
        ValueError: If `task` matches nothing in ALL_TASKS, or the tokenizer
            has neither an eos_token nor a bos_token.
        AttributeError: If `dmx_config` is not a valid config_rules attribute.
        Exception: Whatever evaluator.evaluate raises (logged, then re-raised).
    """
    if accelerator is None:
        from accelerate import Accelerator
        accelerator = Accelerator()

    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")

    # Warm-up forward pass (after the DMX transform, if requested) so the
    # model is fully materialized on the GPU before evaluation begins.
    dummy_input = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        # getattr() replaces the original eval() call: same attribute lookup,
        # without executing arbitrary code from the dmx_config string.
        model.transform(model.dmx_config, *getattr(config_rules, dmx_config))
    else:
        model = model.to("cuda")
    model(dummy_input)

    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        # Some tokenizers expose pad_token as a read-only property.
        print("Not setting pad_token to eos_token")

    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)

    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        # Re-raise after logging: the original code only printed here, which
        # led to a confusing NameError on `unparsed_result` further down.
        print(f"Error evaluating task {task}: {e}")
        raise

    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}

    return result
| |
|
def evaluate_model(model_repo_name, revision_name="main", dmx_config="BASELINE", task_name="humaneval", pass_k=1):
    """Load a causal LM from the hub and evaluate it on a single code task.

    Returns a ``(result, error_message)`` pair; exactly one element is None.
    ``pass_k == 10`` selects a sampling configuration, anything else runs the
    greedy single-sample (pass@1) configuration.
    """
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }

    # Generation settings differ only between the sampled pass@10 run and
    # the deterministic pass@1 run.
    eval_args = (
        {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
        if pass_k == 10
        else {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }
    )

    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        padding_side="right",
        **model_kwargs,
    )

    try:
        return code_eval(model, tokenizer, task_name, dmx_config, args=eval_args), None
    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message
| |
|