File size: 4,796 Bytes
4156f51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
scripts/evaluate.py

Evaluation script — runs N episodes and computes reward statistics.
Use this to generate the reward curves required by the judging criteria.

Usage:
    python scripts/evaluate.py --env-url http://localhost:8000 --n-episodes 50
"""

import argparse
import json
import random
import requests
import statistics
from collections import defaultdict


# Built-in evaluation tasks, one per difficulty tier. A task dict is sent
# unchanged as the JSON body of the environment's /reset request.
SAMPLE_TASKS = [
    # easy
    {
        "instruction": "Find all travel and ride receipts from Gmail in the last 10 days.",
        "difficulty": "easy",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Found ride receipts totaling",
            "expected_numeric_target": 120.50,
        },
    },
    # medium
    {
        "instruction": "Audit all ride receipts from Gmail between 2022 and 2023, and calculate total spend.",
        "difficulty": "medium",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": "Total ride spend",
            "expected_numeric_target": 1500.00,
        },
    },
    # hard (no reference answer is supplied for this task)
    {
        "instruction": "Perform a full financial audit of travel and ride footprint from 2022-2024, flag missing receipts.",
        "difficulty": "hard",
        "available_sources": ["gmail"],
        "ground_truth": {
            "answer": None,
        },
    },
]

# Simulated agent policy (replace with actual model inference in production)
def simulated_agent_step(obs: dict, step: int) -> dict:
    """Return the rule-based baseline agent's action for the given step.

    The agent follows a fixed script: PLAN, one RETRIEVE per available source
    (at most the first two), MEMORIZE, VERIFY, and then ANSWER for every
    remaining step.

    The script is built from the sources that are actually present, so a
    single-source observation still goes through MEMORIZE and VERIFY before
    answering (previously step 2 fell straight through to ANSWER), and an
    observation with no sources simply skips the RETRIEVE phase instead of
    raising ``IndexError``.

    Args:
        obs: Observation dict; must contain ``"instruction"`` (str) and
            ``"available_sources"`` (sequence of source names).
        step: Zero-based index of the step within the episode.

    Returns:
        An action dict with ``"action_type"`` and ``"content"`` keys;
        RETRIEVE actions additionally carry a ``"source"`` key.
    """
    instruction = obs["instruction"]
    sources = obs["available_sources"]

    scripted = [
        {"action_type": "PLAN", "content": f"Plan to retrieve and analyze: {instruction[:60]}"},
    ]
    # One retrieval per source, capped at two to match the two-source script.
    scripted.extend(
        {"action_type": "RETRIEVE", "content": instruction[:50], "source": src}
        for src in sources[:2]
    )
    scripted.append({"action_type": "MEMORIZE", "content": f"Retrieved data from {sources}"})
    scripted.append({"action_type": "VERIFY", "content": f"Verifying findings for: {instruction[:60]}"})

    # The explicit lower bound keeps negative steps from indexing from the end.
    if 0 <= step < len(scripted):
        return scripted[step]

    return {
        "action_type": "ANSWER",
        "content": (
            f"Based on analysis of {', '.join(sources)}, "
            f"I found relevant records matching the query: {instruction[:80]}. "
            f"Summary: Retrieved and verified data across all available sources."
        ),
    }


def run_episode(env_url: str, task: dict, *, max_steps: int = 10, timeout: float = 10) -> dict:
    """Run a single episode against the environment and return its reward statistics.

    The episode is started by POSTing ``task`` to ``{env_url}/reset``; actions
    produced by ``simulated_agent_step`` are then POSTed to ``{env_url}/step``
    until the environment reports ``done`` or ``max_steps`` actions were sent.

    Args:
        env_url: Base URL of the environment server (no trailing slash).
        task: Task payload sent as the JSON body of the reset request; must
            contain a ``"difficulty"`` key.
        max_steps: Upper bound on the number of step requests.
        timeout: Per-request timeout in seconds.

    Returns:
        On success, a dict with ``difficulty``, ``total_reward``, ``steps``,
        ``step_rewards`` and ``done``. If the reset request fails (non-200
        status, transport error, or invalid JSON body), a dict with a single
        ``"error"`` key is returned instead. A failing step request ends the
        episode early and the rewards collected so far are returned.
    """
    # Reset. Transport errors and malformed bodies are reported through the
    # same {"error": ...} shape as a non-200 reply so one unreachable server
    # does not abort the whole evaluation run.
    try:
        resp = requests.post(f"{env_url}/reset", json=task, timeout=timeout)
        if resp.status_code != 200:
            return {"error": resp.text}
        obs = resp.json()
    except (requests.RequestException, ValueError) as exc:
        return {"error": f"reset failed: {exc}"}

    total_reward = 0.0
    step_rewards = []
    done = False
    step = 0

    while not done and step < max_steps:
        action = simulated_agent_step(obs, step)

        # A failed step is treated like a non-200 reply: stop the episode and
        # keep whatever rewards were collected so far.
        try:
            resp = requests.post(f"{env_url}/step", json=action, timeout=timeout)
            if resp.status_code != 200:
                break
            data = resp.json()
        except (requests.RequestException, ValueError):
            break

        reward = data["reward"]
        done = data["done"]
        obs = data["observation"]

        total_reward += reward
        step_rewards.append(reward)
        step += 1

    return {
        "difficulty": task["difficulty"],
        "total_reward": total_reward,
        "steps": step,
        "step_rewards": step_rewards,
        "done": done,
    }


def main():
    """Parse CLI arguments, run the requested episodes and print reward statistics.

    Each episode uses a task drawn at random from ``SAMPLE_TASKS``. Rewards of
    successful episodes are grouped by task difficulty and summarized as
    count / mean / max / min. Episodes that return an ``"error"`` result are
    reported as errors (not as a zero reward) and are left out of the
    statistics.

    Command-line options:
        --env-url: Base URL of the environment server.
        --n-episodes: Number of episodes to run.
        --seed: Optional integer seed that makes the task sampling reproducible.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--n-episodes", type=int, default=30)
    parser.add_argument("--seed", type=int, default=None,
                        help="seed for task sampling (default: non-deterministic)")
    args = parser.parse_args()

    print(f"[Evaluate] Running {args.n_episodes} episodes against {args.env_url}")

    # A private generator keeps seeding local to this run; Random(None) seeds
    # itself from the OS, matching the previous unseeded behavior.
    rng = random.Random(args.seed)

    results_by_difficulty = defaultdict(list)
    failures = 0

    for i in range(args.n_episodes):
        task = rng.choice(SAMPLE_TASKS).copy()
        result = run_episode(args.env_url, task)
        if "error" in result:
            # Show the failure explicitly so it cannot be mistaken for a
            # genuine zero-reward episode; truncate long error bodies.
            failures += 1
            print(f"  Episode {i+1:3d} | {task['difficulty']:6s} | ERROR: {str(result['error'])[:200]}")
            continue
        results_by_difficulty[result["difficulty"]].append(result["total_reward"])
        print(f"  Episode {i+1:3d} | {task['difficulty']:6s} | reward={result['total_reward']:.3f}")

    print("\n── Results ──────────────────────────────────────────────────────")
    # Buckets are only created when a reward is appended, so every list here
    # is non-empty and mean/max/min are always defined.
    for diff, rewards in sorted(results_by_difficulty.items()):
        avg = statistics.mean(rewards)
        mx = max(rewards)
        mn = min(rewards)
        print(f"  {diff:6s} | n={len(rewards):3d} | avg={avg:.3f} | max={mx:.3f} | min={mn:.3f}")

    if failures:
        print(f"  {failures} of {args.n_episodes} episodes failed and were excluded from the statistics.")

    print("\n[Evaluate] Done. Use these numbers for your reward curve plots.")


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()