trace

Sleeping

Ayush

Update

4156f51 about 2 months ago

4.8 kB

	"""
	scripts/evaluate.py

	Evaluation script — runs N episodes and computes reward statistics.
	Use this to generate the reward curves required by the judging criteria.

	Usage:
	python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50
	"""

	import argparse
	import json
	import random
	import requests
	import statistics
	from collections import defaultdict


	# Fixed pool of evaluation tasks, one per difficulty level.
	# Every task draws on the single "gmail" source; the "hard" task carries no
	# expected answer (its ground-truth "answer" is None).
	SAMPLE_TASKS = [
	    # easy
	    {
	        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
	        "difficulty": "easy",
	        "available_sources": ["gmail"],
	        "ground_truth": {
	            "answer": "Found ride receipts totaling",
	            "expected_numeric_target": 120.50,
	        },
	    },
	    # medium
	    {
	        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
	        "difficulty": "medium",
	        "available_sources": ["gmail"],
	        "ground_truth": {
	            "answer": "Total ride spend",
	            "expected_numeric_target": 1500.00,
	        },
	    },
	    # hard
	    {
	        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
	        "difficulty": "hard",
	        "available_sources": ["gmail"],
	        "ground_truth": {"answer": None},
	    },
	]

	# Simulated agent policy (replace with actual model inference in production)
	def simulated_agent_step(obs: dict, step: int) -> dict:
	    """Return the scripted baseline action for one step of an episode.

	    The policy follows a fixed schedule: PLAN, one RETRIEVE for each of the
	    first two available sources, MEMORIZE, VERIFY, and then ANSWER for every
	    later step.

	    Args:
	        obs: Observation dict; must contain "instruction" (str) and
	            "available_sources" (list of source names).
	        step: Zero-based step index within the episode.

	    Returns:
	        An action dict with "action_type" and "content" keys; RETRIEVE
	        actions additionally carry a "source" key.
	    """
	    instruction = obs["instruction"]
	    sources = obs["available_sources"]

	    if step == 0:
	        return {"action_type": "PLAN", "content": f"Plan to retrieve and analyze: {instruction[:60]}"}

	    # At most two sources are queried, one per step, starting at step 1.
	    # An empty source list simply yields no RETRIEVE step.
	    n_retrieve = min(len(sources), 2)
	    if 1 <= step <= n_retrieve:
	        return {"action_type": "RETRIEVE", "content": instruction[:50], "source": sources[step - 1]}

	    # MEMORIZE and VERIFY are positioned relative to the last RETRIEVE rather
	    # than at fixed step numbers; with a fixed slot reserved for a second
	    # source, a single-source task would skip both and answer immediately.
	    phase = step - n_retrieve
	    if phase == 1:
	        return {"action_type": "MEMORIZE", "content": f"Retrieved data from {sources}"}
	    if phase == 2:
	        return {"action_type": "VERIFY", "content": f"Verifying findings for: {instruction[:60]}"}

	    # Any other step (including out-of-range ones) produces the final answer.
	    return {
	        "action_type": "ANSWER",
	        "content": (
	            f"Based on analysis of {', '.join(sources)}, "
	            f"I found relevant records matching the query: {instruction[:80]}. "
	            f"Summary: Retrieved and verified data across all available sources."
	        ),
	    }


	def _post_json(url: str, payload: dict, timeout: float) -> tuple:
	    """POST *payload* as JSON and return ``(decoded_body, error)``.

	    ``error`` is None on success; otherwise it is a string describing the
	    failure (network error, non-200 status body, or undecodable JSON) and
	    ``decoded_body`` is None.
	    """
	    try:
	        resp = requests.post(url, json=payload, timeout=timeout)
	    except requests.RequestException as exc:
	        return None, f"{type(exc).__name__}: {exc}"
	    if resp.status_code != 200:
	        return None, resp.text
	    try:
	        return resp.json(), None
	    except ValueError as exc:  # body was not valid JSON
	        return None, f"invalid JSON response: {exc}"


	def run_episode(env_url: str, task: dict, max_steps: int = 10, timeout: float = 10) -> dict:
	    """Run a single episode against the environment and collect its rewards.

	    Args:
	        env_url: Base URL of the environment server; a trailing "/" is ignored.
	        task: Task dict POSTed to ``/reset``; must contain "difficulty".
	        max_steps: Upper bound on the number of ``/step`` calls.
	        timeout: Per-request timeout in seconds.

	    Returns:
	        ``{"error": msg}`` if the reset fails. Otherwise a dict with
	        "difficulty", "total_reward", "steps", "step_rewards" and "done";
	        if a step request fails part-way, the partial statistics are
	        returned with an additional "error" key so callers can tell a
	        truncated episode from a completed one.
	    """
	    base_url = env_url.rstrip("/")

	    # Reset
	    # NOTE(review): the reset body is used directly as the observation,
	    # while step responses wrap it under "observation" — confirm against
	    # the server.
	    obs, error = _post_json(f"{base_url}/reset", task, timeout)
	    if error is not None:
	        return {"error": error}

	    total_reward = 0.0
	    step_rewards = []
	    done = False
	    step = 0

	    while not done and step < max_steps:
	        action = simulated_agent_step(obs, step)

	        data, error = _post_json(f"{base_url}/step", action, timeout)
	        if error is not None:
	            break

	        try:
	            # Read all three fields before assigning, so a malformed payload
	            # leaves the loop state untouched.
	            reward, done, obs = data["reward"], data["done"], data["observation"]
	        except (KeyError, TypeError) as exc:
	            error = f"malformed step response: {exc!r}"
	            break

	        total_reward += reward
	        step_rewards.append(reward)
	        step += 1

	    result = {
	        "difficulty": task["difficulty"],
	        "total_reward": total_reward,
	        "steps": step,
	        "step_rewards": step_rewards,
	        "done": done,
	    }
	    if error is not None:
	        # Surface the failure instead of silently returning a short episode.
	        result["error"] = error
	    return result


	def main(argv=None):
	    """Run the evaluation loop and print per-difficulty reward statistics.

	    Args:
	        argv: Optional list of command-line arguments; when None, argparse
	            reads ``sys.argv[1:]``.

	    Command-line options:
	        --env-url     Base URL of the environment server.
	        --n-episodes  Number of episodes to run.
	        --seed        Optional integer seed for task sampling, so that a run
	                      can be repeated with the same task sequence.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--env-url", default="http://localhost:8000")
	    parser.add_argument("--n-episodes", type=int, default=30)
	    parser.add_argument("--seed", type=int, default=None,
	                        help="seed for task sampling (default: unseeded)")
	    args = parser.parse_args(argv)

	    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")

	    # A private generator keeps seeding local to this run; with seed=None it
	    # is seeded from system entropy, like the module-level functions.
	    rng = random.Random(args.seed)
	    results_by_difficulty = defaultdict(list)
	    failed = 0

	    for i in range(args.n_episodes):
	        task = rng.choice(SAMPLE_TASKS).copy()
	        result = run_episode(args.env_url, task)
	        if "error" in result:
	            # Failed episodes are reported and counted, but kept out of the
	            # reward statistics.
	            failed += 1
	            print(f" Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {str(result['error'])[:200]}")
	            continue
	        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
	        print(f" Episode {i+1:3d} | {task['difficulty']:6s} | reward={result['total_reward']:.3f}")

	    print("\n── Results ──────────────────────────────────────────────────────")
	    # Each list is non-empty: a key only exists once a reward was appended.
	    for diff, rewards in sorted(results_by_difficulty.items()):
	        avg = statistics.mean(rewards)
	        mx = max(rewards)
	        mn = min(rewards)
	        print(f" {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")
	    if not results_by_difficulty:
	        print(" No successful episodes.")
	    if failed:
	        print(f" {failed} of {args.n_episodes} episodes failed and were excluded.")

	    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")


	# Script entry point: run the evaluation only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	    main()