# trace/scripts/evaluate.py
# Ayush
# Update
# 4156f51
"""
scripts/evaluate.py
Evaluation script — runs N episodes and computes reward statistics.
Use this to generate the reward curves required by the judging criteria.
Usage:
python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50
"""
import argparse
import json
import random
import requests
import statistics
from collections import defaultdict
# Canned evaluation tasks, one per difficulty tier ("easy", "medium", "hard").
# main() picks one at random per episode and run_episode() sends it as the
# JSON body of the environment's /reset request.
SAMPLE_TASKS = [
    {
        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
        "difficulty": "easy",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Found ride receipts totaling",
            "expected_numeric_target": 120.50,
        },
    },
    {
        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
        "difficulty": "medium",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Total ride spend",
            "expected_numeric_target": 1500.00,
        },
    },
    {
        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
        "difficulty": "hard",
        "available_sources": ["gmail"],
        # The hard task carries no reference answer.
        "ground_truth": {"answer": None},
    },
]
# Simulated agent policy (replace with actual model inference in production)
def simulated_agent_step(obs: dict, step: int) -> dict:
    """Return the scripted baseline action for one step of an episode.

    The policy follows a fixed schedule: PLAN, one RETRIEVE for each of the
    first two entries of ``obs["available_sources"]``, MEMORIZE, VERIFY, and
    ANSWER for every step after that (and for any negative step).

    With two or more sources the schedule is PLAN, RETRIEVE, RETRIEVE,
    MEMORIZE, VERIFY, ANSWER. With a single source the second RETRIEVE slot
    is dropped and the later phases move up by one step, so MEMORIZE and
    VERIFY still run before ANSWER. With no sources both RETRIEVE slots are
    dropped.

    Args:
        obs: Observation dict. Must contain "instruction" (a string) and
            "available_sources" (a list of source-name strings).
        step: Zero-based index of the step within the episode.

    Returns:
        An action dict with "action_type" and "content" keys; RETRIEVE
        actions also carry a "source" key.

    Raises:
        KeyError: If ``obs`` lacks "instruction" or "available_sources".
    """
    instruction = obs["instruction"]
    sources = obs["available_sources"]

    # Only the first two sources are ever queried by this baseline.
    retrieve_from = list(sources[:2])
    phases = ["PLAN"] + ["RETRIEVE"] * len(retrieve_from) + ["MEMORIZE", "VERIFY"]
    # Anything outside the schedule (including negative steps) answers.
    phase = phases[step] if 0 <= step < len(phases) else "ANSWER"

    if phase == "PLAN":
        return {
            "action_type": "PLAN",
            "content": f"Plan to retrieve and analyze: {instruction[:60]}",
        }
    if phase == "RETRIEVE":
        # RETRIEVE slots start at step 1, right after PLAN.
        return {
            "action_type": "RETRIEVE",
            "content": instruction[:50],
            "source": retrieve_from[step - 1],
        }
    if phase == "MEMORIZE":
        return {
            "action_type": "MEMORIZE",
            "content": f"Retrieved data from {sources}",
        }
    if phase == "VERIFY":
        return {
            "action_type": "VERIFY",
            "content": f"Verifying findings for: {instruction[:60]}",
        }
    return {
        "action_type": "ANSWER",
        "content": (
            f"Based on analysis of {', '.join(sources)}, "
            f"I found relevant records matching the query: {instruction[:80]}. "
            f"Summary: Retrieved and verified data across all available sources."
        ),
    }
def run_episode(env_url: str, task: dict, max_steps: int = 10, timeout: float = 10) -> dict:
    """Run a single episode against the environment and return its rewards.

    POSTs ``task`` to ``{env_url}/reset``, then repeatedly POSTs the action
    chosen by ``simulated_agent_step`` to ``{env_url}/step`` until the
    environment reports ``done`` or ``max_steps`` steps have been taken.

    Args:
        env_url: Base URL of the environment server (no trailing slash).
        task: Task dict sent as the JSON body of the reset request; its
            "difficulty" value is echoed in the result.
        max_steps: Upper bound on the number of step requests.
        timeout: Per-request timeout in seconds, passed to ``requests.post``.

    Returns:
        On success, a dict with "difficulty", "total_reward", "steps",
        "step_rewards" and "done". If the reset request fails (transport
        error, non-200 status, or a body that is not JSON), a dict with a
        single "error" key instead. A failure on a later step request ends
        the episode early and the rewards collected so far are returned.
    """
    # Reset. Transport failures follow the same {"error": ...} convention as
    # a non-200 status, so one unreachable server does not abort the caller.
    try:
        resp = requests.post(f"{env_url}/reset", json=task, timeout=timeout)
    except requests.RequestException as exc:
        return {"error": f"reset request failed: {exc}"}
    if resp.status_code != 200:
        return {"error": resp.text}
    try:
        obs = resp.json()
    except ValueError as exc:
        return {"error": f"reset returned invalid JSON: {exc}"}

    total_reward = 0.0
    step_rewards = []
    done = False
    step = 0
    while not done and step < max_steps:
        action = simulated_agent_step(obs, step)
        try:
            resp = requests.post(f"{env_url}/step", json=action, timeout=timeout)
        except requests.RequestException as exc:
            print(f" [warn] step {step} request failed, ending episode early: {exc}")
            break
        if resp.status_code != 200:
            print(f" [warn] step {step} returned HTTP {resp.status_code}, ending episode early")
            break
        try:
            data = resp.json()
        except ValueError as exc:
            print(f" [warn] step {step} returned invalid JSON, ending episode early: {exc}")
            break
        reward = data["reward"]
        done = data["done"]
        obs = data["observation"]
        total_reward += reward
        step_rewards.append(reward)
        step += 1

    return {
        "difficulty": task["difficulty"],
        "total_reward": total_reward,
        "steps": step,
        "step_rewards": step_rewards,
        "done": done,
    }
def main(argv=None):
    """Run the evaluation loop and print per-difficulty reward statistics.

    Parses ``--env-url`` and ``--n-episodes``, runs that many episodes with a
    randomly chosen entry of ``SAMPLE_TASKS`` each, and prints one line per
    episode followed by the count, mean, max and min total reward for every
    difficulty that had at least one successful episode. Episodes for which
    ``run_episode`` returns an "error" are reported as errors and excluded
    from the statistics.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--n-episodes", type=int, default=30)
    args = parser.parse_args(argv)

    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")
    results_by_difficulty = defaultdict(list)
    failed = 0
    for i in range(args.n_episodes):
        task = random.choice(SAMPLE_TASKS).copy()
        result = run_episode(args.env_url, task)
        if "error" in result:
            # Show the failure instead of a misleading reward=0.000 line;
            # the message is truncated so an HTML error page stays readable.
            failed += 1
            print(f" Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {str(result['error'])[:200]}")
            continue
        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
        print(f" Episode {i+1:3d} | {task['difficulty']:6s} | reward={result['total_reward']:.3f}")

    print("\n── Results ──────────────────────────────────────────────────────")
    # Every list in results_by_difficulty has at least one entry, because a
    # key is only created when a reward is appended.
    for diff, rewards in sorted(results_by_difficulty.items()):
        avg = statistics.mean(rewards)
        mx = max(rewards)
        mn = min(rewards)
        print(f" {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")
    if failed:
        print(f" {failed} of {args.n_episodes} episodes failed and were excluded from the statistics.")
    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")


if __name__ == "__main__":
    main()