Spaces:
Sleeping
Sleeping
| """ | |
| scripts/evaluate.py | |
| Evaluation script — runs N episodes and computes reward statistics. | |
| Use this to generate the reward curves required by the judging criteria. | |
| Usage: | |
| python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50 | |
| """ | |
| import argparse | |
| import json | |
| import random | |
| import requests | |
| import statistics | |
| from collections import defaultdict | |
# Task payloads that main() samples from and run_episode() posts to the
# environment's /reset endpoint. One entry per difficulty tier; every task
# draws on the single "gmail" source.
SAMPLE_TASKS = [
    # easy: short look-back window with a known numeric target
    {
        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
        "difficulty": "easy",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Found ride receipts totaling",
            "expected_numeric_target": 120.50,
        },
    },
    # medium: multi-year range with a total to compute
    {
        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
        "difficulty": "medium",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Total ride spend",
            "expected_numeric_target": 1500.00,
        },
    },
    # hard: open-ended audit; no reference answer is supplied
    {
        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
        "difficulty": "hard",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": None},
    },
]
# Simulated agent policy (replace with actual model inference in production)
def simulated_agent_step(obs: dict, step: int) -> dict:
    """Return the scripted baseline action for one step of an episode.

    The script is: PLAN, one RETRIEVE per available source (at most the
    first two), MEMORIZE, VERIFY, and then ANSWER for every later step.
    The script is built from the sources actually present, so an
    observation with a single source still goes through MEMORIZE and
    VERIFY before answering instead of jumping straight to ANSWER.

    Args:
        obs: Observation dict providing "instruction" (str) and
            "available_sources" (list of source names).
        step: Zero-based index of the step within the episode.

    Returns:
        An action dict with "action_type" and "content"; RETRIEVE actions
        additionally carry the "source" to query.
    """
    instruction = obs["instruction"]
    sources = obs["available_sources"]

    script = [
        {
            "action_type": "PLAN",
            "content": f"Plan to retrieve and analyze: {instruction[:60]}",
        }
    ]
    # Only the first two sources are queried, matching the fixed two
    # RETRIEVE slots of the baseline; an empty list simply adds none.
    script.extend(
        {"action_type": "RETRIEVE", "content": instruction[:50], "source": src}
        for src in sources[:2]
    )
    script.append(
        {"action_type": "MEMORIZE", "content": f"Retrieved data from {sources}"}
    )
    script.append(
        {
            "action_type": "VERIFY",
            "content": f"Verifying findings for: {instruction[:60]}",
        }
    )

    # The lower bound keeps a negative step from indexing the script from
    # the end; like any step past the script, it yields ANSWER.
    if 0 <= step < len(script):
        return script[step]

    return {
        "action_type": "ANSWER",
        "content": (
            f"Based on analysis of {', '.join(sources)}, "
            f"I found relevant records matching the query: {instruction[:80]}. "
            f"Summary: Retrieved and verified data across all available sources."
        ),
    }
def run_episode(env_url: str, task: dict, max_steps: int = 10, timeout: float = 10) -> dict:
    """Run a single episode against the environment and collect its rewards.

    The task is posted to ``{env_url}/reset``; the scripted agent's actions
    are then posted to ``{env_url}/step`` until the environment reports
    ``done`` or ``max_steps`` steps have completed.

    Args:
        env_url: Base URL of the environment server.
        task: Task payload sent as the JSON body of the reset request; its
            "difficulty" value is echoed in the result.
        max_steps: Upper bound on the number of steps taken in the episode.
        timeout: Per-request timeout in seconds.

    Returns:
        ``{"error": message}`` when the reset request fails (network error
        or non-200 status). Otherwise a dict with "difficulty",
        "total_reward", "steps", "step_rewards" and "done". A failing step
        request ends the episode early and the rewards gathered so far are
        returned.
    """
    # Reset
    try:
        resp = requests.post(f"{env_url}/reset", json=task, timeout=timeout)
    except requests.RequestException as exc:
        # Report an unreachable/slow server the same way as a non-200 reply
        # so one bad request does not abort the whole evaluation run.
        return {"error": f"reset request failed: {exc}"}
    if resp.status_code != 200:
        return {"error": resp.text}
    obs = resp.json()

    total_reward = 0.0
    step_rewards = []
    done = False
    step = 0
    while not done and step < max_steps:
        action = simulated_agent_step(obs, step)
        try:
            resp = requests.post(f"{env_url}/step", json=action, timeout=timeout)
        except requests.RequestException:
            # Best effort: treat a network failure like a non-200 reply and
            # keep the partial episode.
            break
        if resp.status_code != 200:
            break
        data = resp.json()
        reward = data["reward"]
        done = data["done"]
        obs = data["observation"]
        total_reward += reward
        step_rewards.append(reward)
        step += 1

    return {
        "difficulty": task["difficulty"],
        "total_reward": total_reward,
        "steps": step,
        "step_rewards": step_rewards,
        "done": done,
    }
def main():
    """Run the evaluation from the command line and print reward statistics.

    Command-line options:
        --env-url: Base URL of the environment server
            (default ``http://localhost:8000``).
        --n-episodes: Number of episodes to run (default 30).

    Each episode uses a task drawn at random from ``SAMPLE_TASKS``. One line
    is printed per episode, followed by the count, mean, max and min of the
    total reward for each difficulty. Episodes for which ``run_episode``
    returns an "error" are printed as errors and left out of the statistics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--n-episodes", type=int, default=30)
    args = parser.parse_args()

    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")
    results_by_difficulty = defaultdict(list)
    failed = 0
    for i in range(args.n_episodes):
        task = random.choice(SAMPLE_TASKS).copy()
        result = run_episode(args.env_url, task)
        if "error" in result:
            # Show the failure instead of a reward of 0.000, which would be
            # indistinguishable from a genuine zero-reward episode.
            failed += 1
            print(f" Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {result['error']}")
            continue
        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
        print(f" Episode {i+1:3d} | {task['difficulty']:6s} | reward={result['total_reward']:.3f}")

    print("\n── Results ──────────────────────────────────────────────────────")
    if not results_by_difficulty:
        print(" (no successful episodes)")
    # A difficulty only appears here once a reward was appended for it, so
    # every rewards list is non-empty.
    for diff, rewards in sorted(results_by_difficulty.items()):
        avg = statistics.mean(rewards)
        mx = max(rewards)
        mn = min(rewards)
        print(f" {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")
    if failed:
        print(f" {failed} of {args.n_episodes} episodes failed and were excluded from the statistics.")
    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")


if __name__ == "__main__":
    main()