Spaces:
Sleeping
Sleeping
File size: 4,796 Bytes
4156f51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | """
scripts/evaluate.py
Evaluation script — runs N episodes and computes reward statistics.
Use this to generate the reward curves required by the judging criteria.
Usage:
python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50
"""
import argparse
import json
import random
import requests
import statistics
from collections import defaultdict
# Task payloads sent verbatim to the environment's /reset endpoint, one per
# difficulty tier. Each task names its data sources and carries the ground
# truth that accompanies the instruction.
SAMPLE_TASKS = [
    # Easy tier.
    {
        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
        "difficulty": "easy",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Found ride receipts totaling",
            "expected_numeric_target": 120.50,
        },
    },
    # Medium tier.
    {
        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
        "difficulty": "medium",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Total ride spend",
            "expected_numeric_target": 1500.00,
        },
    },
    # Hard tier: no reference answer and no numeric target are provided.
    {
        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
        "difficulty": "hard",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": None,
        },
    },
]
# Simulated agent policy (replace with actual model inference in production)
def simulated_agent_step(obs: dict, step: int) -> dict:
    """Return the rule-based baseline action for the given step of an episode.

    The agent follows a fixed schedule: PLAN, one RETRIEVE per source (at most
    the first two sources), MEMORIZE, VERIFY, and then ANSWER for every later
    step. The schedule is laid out relative to the number of retrievals, so an
    observation with a single source still goes through MEMORIZE and VERIFY
    before answering, and an observation with no sources skips RETRIEVE
    instead of failing on an empty list.

    Args:
        obs: Observation dict; must contain "instruction" (a string) and
            "available_sources" (a list of source names).
        step: Zero-based index of the step within the episode.

    Returns:
        An action dict with "action_type" and "content" keys; RETRIEVE actions
        additionally carry a "source" key.

    Raises:
        KeyError: If ``obs`` lacks a key needed for the selected action.
    """
    instruction = obs["instruction"]

    if step == 0:
        return {
            "action_type": "PLAN",
            "content": f"Plan to retrieve and analyze: {instruction[:60]}",
        }

    sources = obs["available_sources"]
    # Only the first two sources are queried, one per step.
    retrieve_sources = sources[:2]

    if 1 <= step <= len(retrieve_sources):
        return {
            "action_type": "RETRIEVE",
            "content": instruction[:50],
            "source": retrieve_sources[step - 1],
        }

    # Position within the post-retrieval phases: 1 -> MEMORIZE, 2 -> VERIFY.
    # Counting from the end of the retrievals (rather than using absolute step
    # numbers) keeps these phases reachable for any number of sources.
    phase = step - len(retrieve_sources)
    if phase == 1:
        return {
            "action_type": "MEMORIZE",
            "content": f"Retrieved data from {sources}",
        }
    if phase == 2:
        return {
            "action_type": "VERIFY",
            "content": f"Verifying findings for: {instruction[:60]}",
        }

    # Every remaining step (including out-of-range ones) yields the answer.
    return {
        "action_type": "ANSWER",
        "content": (
            f"Based on analysis of {', '.join(sources)}, "
            f"I found relevant records matching the query: {instruction[:80]}. "
            f"Summary: Retrieved and verified data across all available sources."
        ),
    }
def run_episode(env_url: str, task: dict, max_steps: int = 10) -> dict:
    """Run a single episode against the environment and return its rewards.

    The task is POSTed to ``{env_url}/reset`` and the JSON body of the
    response is used as the initial observation. The simulated agent then
    POSTs one action per step to ``{env_url}/step`` until the environment
    reports ``done`` or ``max_steps`` steps have been taken.

    Args:
        env_url: Base URL of the environment server (no trailing slash).
        task: Task payload sent to ``/reset``; must contain "difficulty".
        max_steps: Upper bound on the number of steps taken in the episode.

    Returns:
        ``{"error": <message>}`` when the reset request fails or returns a
        non-200 status. Otherwise a dict with "difficulty", "total_reward",
        "steps", "step_rewards", "done" and "step_error". "step_error" is
        ``None`` unless a step request failed, in which case the episode is
        cut short and the rewards collected so far are returned.
    """
    # Reset. A transport failure is reported the same way as a non-200 reply
    # so that one unreachable request does not abort the whole evaluation.
    try:
        resp = requests.post(f"{env_url}/reset", json=task, timeout=10)
    except requests.RequestException as exc:
        return {"error": f"reset request failed: {exc}"}
    if resp.status_code != 200:
        return {"error": resp.text}
    obs = resp.json()

    total_reward = 0.0
    step_rewards = []
    done = False
    step = 0
    step_error = None
    while not done and step < max_steps:
        action = simulated_agent_step(obs, step)
        try:
            resp = requests.post(f"{env_url}/step", json=action, timeout=10)
        except requests.RequestException as exc:
            # Keep the partial episode, but record why it stopped early.
            step_error = f"step request failed: {exc}"
            break
        if resp.status_code != 200:
            step_error = f"step returned HTTP {resp.status_code}"
            break
        data = resp.json()
        reward = data["reward"]
        done = data["done"]
        obs = data["observation"]
        total_reward += reward
        step_rewards.append(reward)
        step += 1

    return {
        "difficulty": task["difficulty"],
        "total_reward": total_reward,
        "steps": step,
        "step_rewards": step_rewards,
        "done": done,
        "step_error": step_error,
    }
def main():
    """Run the requested number of episodes and print reward statistics.

    Command-line options:
        --env-url: Base URL of the environment server
            (default ``http://localhost:8000``).
        --n-episodes: Number of episodes to run (default 30).

    Each episode uses a task picked at random from ``SAMPLE_TASKS``. Episodes
    whose result carries an "error" key are reported as errors and excluded
    from the statistics; the remaining total rewards are grouped by task
    difficulty and summarised as count, mean, max and min.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--n-episodes", type=int, default=30)
    args = parser.parse_args()

    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")

    results_by_difficulty = defaultdict(list)
    failed = 0
    for i in range(args.n_episodes):
        task = random.choice(SAMPLE_TASKS).copy()
        result = run_episode(args.env_url, task)
        if "error" in result:
            # Show the failure instead of a misleading reward=0.000 line.
            # The message is truncated because it may be a full response body.
            failed += 1
            message = str(result["error"])[:200]
            print(f" Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {message}")
            continue
        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
        print(f" Episode {i+1:3d} | {task['difficulty']:6s} | reward={result['total_reward']:.3f}")

    print("\n── Results " + "─" * 54)
    if not results_by_difficulty:
        print(" (no successful episodes)")
    for diff, rewards in sorted(results_by_difficulty.items()):
        # A list only exists once a reward has been appended, so it is never
        # empty here and mean/max/min are always defined.
        avg = statistics.mean(rewards)
        mx = max(rewards)
        mn = min(rewards)
        print(f" {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")
    if failed:
        print(f" {failed} of {args.n_episodes} episodes failed and were excluded from the statistics.")

    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|