"""
scripts/evaluate.py

Evaluation script — runs N episodes and computes reward statistics.
Use this to generate the reward curves required by the judging criteria.

Usage:
    python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50
"""

import argparse
import json
import random
import statistics
from collections import defaultdict

import requests

SAMPLE_TASKS = [
    {
        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
        "difficulty": "easy",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": "Found ride receipts totaling", "expected_numeric_target": 120.50},
    },
    {
        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
        "difficulty": "medium",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": "Total ride spend", "expected_numeric_target": 1500.00},
    },
    {
        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
        "difficulty": "hard",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": None},
    },
]

# The baseline policy issues at most this many RETRIEVE actions per episode
# (one per source, in the order the sources are listed).
_MAX_RETRIEVE_SOURCES = 2


# Simulated agent policy (replace with actual model inference in production)
def simulated_agent_step(obs: dict, step: int) -> dict:
    """Return the rule-based baseline action for the given step.

    The policy follows a fixed schedule: PLAN, one RETRIEVE per available
    source (at most ``_MAX_RETRIEVE_SOURCES``), MEMORIZE, VERIFY, and then
    ANSWER for every later step.

    The schedule is built from the sources actually present, so a task with
    a single source still goes through MEMORIZE and VERIFY before answering,
    and a task with no sources skips RETRIEVE instead of raising.

    Args:
        obs: Observation dict; must contain ``"instruction"`` (str) and
            ``"available_sources"`` (list of str).
        step: Zero-based index of the step within the episode.

    Returns:
        An action dict with ``"action_type"`` and ``"content"`` keys, plus
        ``"source"`` for RETRIEVE actions.
    """
    instruction = obs["instruction"]
    sources = obs["available_sources"]

    schedule = [
        {"action_type": "PLAN", "content": f"Plan to retrieve and analyze: {instruction[:60]}"},
    ]
    schedule.extend(
        {"action_type": "RETRIEVE", "content": instruction[:50], "source": src}
        for src in sources[:_MAX_RETRIEVE_SOURCES]
    )
    schedule.append({"action_type": "MEMORIZE", "content": f"Retrieved data from {sources}"})
    schedule.append({"action_type": "VERIFY", "content": f"Verifying findings for: {instruction[:60]}"})

    if 0 <= step < len(schedule):
        return schedule[step]

    # Past the end of the schedule (or an out-of-range step): give the answer.
    return {
        "action_type": "ANSWER",
        "content": (
            f"Based on analysis of {', '.join(sources)}, "
            f"I found relevant records matching the query: {instruction[:80]}. "
            "Summary: Retrieved and verified data across all available sources."
        ),
    }


def _post_json(url: str, payload: dict, timeout: float) -> tuple:
    """POST ``payload`` as JSON and decode the JSON reply.

    Returns:
        ``(data, None)`` on success, or ``(None, message)`` when the request
        fails, the status code is not 200, or the body is not valid JSON.
    """
    try:
        resp = requests.post(url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        return None, f"request to {url} failed: {exc}"
    if resp.status_code != 200:
        return None, resp.text or f"HTTP {resp.status_code} from {url}"
    try:
        return resp.json(), None
    except ValueError as exc:  # body was not JSON
        return None, f"invalid JSON from {url}: {exc}"


def run_episode(env_url: str, task: dict, max_steps: int = 10, timeout: float = 10) -> dict:
    """Run a single episode and return reward statistics.

    Posts ``task`` to ``{env_url}/reset``, then repeatedly posts the action
    chosen by ``simulated_agent_step`` to ``{env_url}/step`` until the
    environment reports ``done`` or ``max_steps`` actions have been sent.

    Args:
        env_url: Base URL of the environment server.
        task: Task dict sent as the JSON body of the reset request.
        max_steps: Maximum number of actions sent in one episode.
        timeout: Per-request timeout in seconds.

    Returns:
        A dict with ``"difficulty"``, ``"total_reward"``, ``"steps"``,
        ``"step_rewards"`` and ``"done"``. If the reset fails, only
        ``"difficulty"`` and ``"error"`` are present. If a step fails, the
        partial statistics are returned together with an ``"error"`` key so
        callers can tell a truncated episode from a completed one.
    """
    base_url = env_url.rstrip("/")  # avoid "//reset" when the URL ends with "/"
    difficulty = task.get("difficulty", "unknown")

    obs, error = _post_json(f"{base_url}/reset", task, timeout)
    if error is not None:
        return {"difficulty": difficulty, "error": error}

    total_reward = 0.0
    step_rewards = []
    done = False
    step = 0

    while not done and step < max_steps:
        action = simulated_agent_step(obs, step)
        data, error = _post_json(f"{base_url}/step", action, timeout)
        if error is not None:
            break
        try:
            reward = data["reward"]
            done = data["done"]
            obs = data["observation"]
        except (KeyError, TypeError) as exc:
            error = f"malformed step response: {exc!r}"
            break
        total_reward += reward
        step_rewards.append(reward)
        step += 1

    result = {
        "difficulty": difficulty,
        "total_reward": total_reward,
        "steps": step,
        "step_rewards": step_rewards,
        "done": done,
    }
    if error is not None:
        result["error"] = error
    return result


def main():
    """Parse CLI arguments, run the episodes, and print per-difficulty stats.

    Episodes whose result carries an ``"error"`` key are reported on their
    own line and left out of the statistics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--n-episodes", type=int, default=30)
    parser.add_argument("--seed", type=int, default=None, help="seed for reproducible task sampling")
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)

    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")

    results_by_difficulty = defaultdict(list)
    n_failed = 0

    for i in range(args.n_episodes):
        # Shallow copy so the request payload is not the shared module-level dict.
        task = random.choice(SAMPLE_TASKS).copy()
        result = run_episode(args.env_url, task)
        if "error" in result:
            n_failed += 1
            print(f" Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {result['error']}")
            continue
        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
        print(f" Episode {i+1:3d} | {task['difficulty']:6s} | reward={result.get('total_reward', 0):.3f}")

    if n_failed:
        print(f"[Evaluate] {n_failed} episode(s) failed and were excluded from the statistics.")

    if not results_by_difficulty:
        print("[Evaluate] No successful episodes; nothing to report.")
        return

    print("\n── Results ──────────────────────────────────────────────────────")
    # Each list was created by an append above, so none of them is empty.
    for diff, rewards in sorted(results_by_difficulty.items()):
        avg = statistics.mean(rewards)
        mx = max(rewards)
        mn = min(rewards)
        print(f" {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")

    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")


if __name__ == "__main__":
    main()