trace

Sleeping

File size: 4,796 Bytes

4156f51

"""
scripts/evaluate.py

Evaluation script — runs N episodes and computes reward statistics.
Use this to generate the reward curves required by the judging criteria.

Usage:
    python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50
"""

import argparse
import json
import random
import requests
import statistics
from collections import defaultdict


# Pool of task payloads. main() picks one at random per episode and
# run_episode() sends it unchanged as the JSON body of the /reset request.
# "difficulty" is also the key under which main() groups episode rewards.
# NOTE(review): "ground_truth" is presumably consumed by the environment for
# scoring; nothing in this script reads it -- confirm against the server.
SAMPLE_TASKS = [
    # Easy: single source, short time window.
    {
        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
        "difficulty": "easy",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": "Found ride receipts totaling", "expected_numeric_target": 120.50},
    },
    # Medium: single source, multi-year range plus an aggregate total.
    {
        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
        "difficulty": "medium",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": "Total ride spend", "expected_numeric_target": 1500.00},
    },
    # Hard: no reference answer and no "expected_numeric_target" key at all.
    {
        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
        "difficulty": "hard",
        "available_sources": ["gmail"],
        "ground_truth": {"answer": None},
    },
]

# Simulated agent policy (replace with actual model inference in production)
def simulated_agent_step(obs: dict, step: int) -> dict:
    """Return the scripted baseline action for one step of an episode.

    The policy follows a fixed schedule: PLAN, then one RETRIEVE per source
    (at most the first two sources), then MEMORIZE, then VERIFY, and ANSWER
    for every step after that (and for any negative step).

    Args:
        obs: Current observation. Must provide ``"instruction"`` (a string)
            and, for every step other than 0, ``"available_sources"``
            (a list of source names).
        step: Zero-based index of the action within the episode.

    Returns:
        An action dict with ``"action_type"`` and ``"content"`` keys;
        RETRIEVE actions additionally carry a ``"source"`` key.

    Raises:
        KeyError: If ``obs`` lacks a key required for the requested step.
    """
    instruction = obs["instruction"]

    if step == 0:
        return {"action_type": "PLAN", "content": f"Plan to retrieve and analyze: {instruction[:60]}"}

    sources = obs["available_sources"]
    # Only the first two sources are ever queried, one per step.
    retrieve_count = min(len(sources), 2)

    if 1 <= step <= retrieve_count:
        return {"action_type": "RETRIEVE", "content": instruction[:50], "source": sources[step - 1]}

    # The rest of the schedule is anchored on the number of retrievals rather
    # than on fixed step numbers. With a single source the agent therefore
    # still memorizes and verifies before answering, instead of emitting
    # ANSWER at step 2 and only afterwards MEMORIZE / VERIFY.
    if step == retrieve_count + 1:
        return {"action_type": "MEMORIZE", "content": f"Retrieved data from {sources}"}

    if step == retrieve_count + 2:
        return {"action_type": "VERIFY", "content": f"Verifying findings for: {instruction[:60]}"}

    return {
        "action_type": "ANSWER",
        "content": (
            f"Based on analysis of {', '.join(sources)}, "
            f"I found relevant records matching the query: {instruction[:80]}. "
            f"Summary: Retrieved and verified data across all available sources."
        ),
    }


def run_episode(env_url: str, task: dict, max_steps: int = 10, timeout: float = 10) -> dict:
    """Run a single episode against the environment and collect its rewards.

    The task is POSTed to ``{env_url}/reset`` and the JSON body of the
    response is used as the first observation. The simulated agent is then
    stepped through ``{env_url}/step`` until the environment reports
    ``done`` or ``max_steps`` actions have been sent.

    Args:
        env_url: Base URL of the environment server; ``/reset`` and ``/step``
            are appended to it.
        task: Task payload sent as the JSON body of ``/reset``. Must contain
            a ``"difficulty"`` key, which is echoed in the result.
        max_steps: Upper bound on the number of actions sent per episode.
        timeout: Per-request timeout in seconds.

    Returns:
        On success, a dict with the keys ``"difficulty"``, ``"total_reward"``,
        ``"steps"``, ``"step_rewards"`` and ``"done"``. If the episode cannot
        be started, or the connection fails part-way through, a dict with a
        single ``"error"`` key describing the failure.
    """
    # Reset. Transport failures are reported the same way as a non-200
    # response so that one unreachable server does not abort the whole run.
    try:
        resp = requests.post(f"{env_url}/reset", json=task, timeout=timeout)
    except requests.RequestException as exc:
        return {"error": f"reset request failed: {exc}"}
    if resp.status_code != 200:
        return {"error": resp.text}
    try:
        obs = resp.json()
    except ValueError as exc:
        return {"error": f"reset returned invalid JSON: {exc}"}

    total_reward = 0.0
    step_rewards = []
    done = False
    step = 0

    while not done and step < max_steps:
        action = simulated_agent_step(obs, step)

        try:
            resp = requests.post(f"{env_url}/step", json=action, timeout=timeout)
        except requests.RequestException as exc:
            # An infrastructure failure says nothing about the agent, so the
            # episode is reported as an error instead of a low-reward result.
            return {"error": f"step {step} request failed: {exc}"}
        if resp.status_code != 200:
            # The server rejected the step: stop here and keep the rewards
            # collected so far.
            break

        try:
            data = resp.json()
        except ValueError as exc:
            return {"error": f"step {step} returned invalid JSON: {exc}"}

        reward = data["reward"]
        done = data["done"]
        obs = data["observation"]

        total_reward += reward
        step_rewards.append(reward)
        step += 1

    return {
        "difficulty": task["difficulty"],
        "total_reward": total_reward,
        "steps": step,
        "step_rewards": step_rewards,
        "done": done,
    }


def main(argv=None):
    """Run the evaluation loop and print per-difficulty reward statistics.

    Each episode uses a task drawn at random from ``SAMPLE_TASKS``. Episodes
    that fail (``run_episode`` returned an ``"error"``) are reported as such
    and excluded from the statistics.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Run N episodes against the environment and summarize rewards."
    )
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--n-episodes", type=int, default=30)
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Seed for task sampling; omit for a non-reproducible run.",
    )
    args = parser.parse_args(argv)

    # Without a seed, keep using the module-level generator as before; with a
    # seed, use a private generator so the task sequence is reproducible.
    rng = random if args.seed is None else random.Random(args.seed)

    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")

    results_by_difficulty = defaultdict(list)
    failed = 0

    for i in range(args.n_episodes):
        task = rng.choice(SAMPLE_TASKS).copy()
        result = run_episode(args.env_url, task)
        if "error" in result:
            # Show the failure explicitly; printing reward=0.000 would make it
            # indistinguishable from a real zero-reward episode.
            failed += 1
            message = str(result["error"])[:200]
            print(f"  Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {message}")
            continue
        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
        print(f"  Episode {i+1:3d} | {task['difficulty']:6s} | reward={result['total_reward']:.3f}")

    print("\n── Results ──────────────────────────────────────────────────────")
    for diff, rewards in sorted(results_by_difficulty.items()):
        avg = statistics.mean(rewards) if rewards else 0
        mx = max(rewards) if rewards else 0
        mn = min(rewards) if rewards else 0
        print(f"  {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")
    if failed:
        print(f"  failed episodes: {failed} (excluded from the statistics above)")

    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")


if __name__ == "__main__":
    main()