Spaces:
Sleeping
Sleeping
Fix RMSE baseline hline and R2 axis clipping on Overview chart
Browse files
- bsl now uses api_metrics baseline_rmse instead of perf_df["rmse"].iloc[0],
which was picking up a high mid-drift value and flattening the visual scale
- R2 y-axis range now scales dynamically to include negative values (was
hard-coded [0, 1.05], clipping data points during peak degradation)
- Add unit + Selenium tests for both fixes (34 tests, all passing locally)
- dashboard/app.py +4 -2
- tests/test_dashboard.py +163 -0
dashboard/app.py
CHANGED
|
@@ -444,7 +444,7 @@ if page == "Overview":
|
|
| 444 |
|
| 445 |
if not perf_df.empty and "rmse" in perf_df.columns:
|
| 446 |
perf_df["idx"] = range(len(perf_df))
|
| 447 |
-
bsl = perf_df["rmse"].iloc[0]
|
| 448 |
|
| 449 |
fig = go.Figure()
|
| 450 |
fig.add_trace(go.Scatter(
|
|
@@ -507,11 +507,13 @@ if page == "Overview":
|
|
| 507 |
annotation_text="Quality floor (0.80)",
|
| 508 |
annotation_position="bottom right",
|
| 509 |
annotation_font_color=WARN)
|
|
|
|
|
|
|
| 510 |
fig2.update_layout(
|
| 511 |
**_plotly_layout(
|
| 512 |
height=160,
|
| 513 |
xaxis=dict(title="", gridcolor=BORDER),
|
| 514 |
-
yaxis=dict(title="R\u00b2", gridcolor=BORDER, range=[0, 1.05]),
|
| 515 |
showlegend=False,
|
| 516 |
)
|
| 517 |
)
|
|
|
|
| 444 |
|
| 445 |
if not perf_df.empty and "rmse" in perf_df.columns:
|
| 446 |
perf_df["idx"] = range(len(perf_df))
|
| 447 |
+
bsl = baseline or perf_df["rmse"].min()
|
| 448 |
|
| 449 |
fig = go.Figure()
|
| 450 |
fig.add_trace(go.Scatter(
|
|
|
|
| 507 |
annotation_text="Quality floor (0.80)",
|
| 508 |
annotation_position="bottom right",
|
| 509 |
annotation_font_color=WARN)
|
| 510 |
+
r2_min = float(perf_df["r2"].min())
|
| 511 |
+
r2_floor = min(r2_min - 0.05, -0.1) if r2_min < 0 else -0.05
|
| 512 |
fig2.update_layout(
|
| 513 |
**_plotly_layout(
|
| 514 |
height=160,
|
| 515 |
xaxis=dict(title="", gridcolor=BORDER),
|
| 516 |
+
yaxis=dict(title="R\u00b2", gridcolor=BORDER, range=[r2_floor, 1.05]),
|
| 517 |
showlegend=False,
|
| 518 |
)
|
| 519 |
)
|
tests/test_dashboard.py
CHANGED
|
@@ -272,3 +272,166 @@ class TestDashboardUI:
|
|
| 272 |
"""Save a screenshot to assets/test_screenshot.png for inspection."""
|
| 273 |
screenshot_path = ROOT / "assets" / "test_screenshot.png"
|
| 274 |
driver.save_screenshot(str(screenshot_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
"""Save a screenshot to assets/test_screenshot.png for inspection."""
|
| 273 |
screenshot_path = ROOT / "assets" / "test_screenshot.png"
|
| 274 |
driver.save_screenshot(str(screenshot_path))
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# ---------------------------------------------------------------------------
|
| 278 |
+
# Unit tests: fix #1 — baseline RMSE must not use iloc[0] from the log
|
| 279 |
+
# ---------------------------------------------------------------------------
|
| 280 |
+
|
| 281 |
+
class TestBaselineRmseLogic:
|
| 282 |
+
"""
|
| 283 |
+
Verify that the baseline hline calculation uses api_metrics baseline_rmse
|
| 284 |
+
rather than the first row of the performance log.
|
| 285 |
+
|
| 286 |
+
Before the fix: bsl = perf_df["rmse"].iloc[0]
|
| 287 |
+
After the fix: bsl = baseline or perf_df["rmse"].min()
|
| 288 |
+
|
| 289 |
+
If the log starts mid-drift (high RMSE), iloc[0] would have been wrong.
|
| 290 |
+
"""
|
| 291 |
+
|
| 292 |
+
def _bsl(self, api_baseline, perf_rmse_values: list) -> float:
|
| 293 |
+
"""Replicate the fixed dashboard bsl calculation."""
|
| 294 |
+
import numpy as np
|
| 295 |
+
df = pd.DataFrame({"rmse": perf_rmse_values})
|
| 296 |
+
baseline = api_baseline
|
| 297 |
+
return baseline if baseline else float(df["rmse"].min())
|
| 298 |
+
|
| 299 |
+
def test_uses_api_baseline_when_available(self):
|
| 300 |
+
# Log starts at a high value (simulating mid-drift start)
|
| 301 |
+
rmse_series = [10.5, 10.8, 11.2, 11.0, 10.9]
|
| 302 |
+
bsl = self._bsl(api_baseline=2.1, perf_rmse_values=rmse_series)
|
| 303 |
+
assert bsl == 2.1, (
|
| 304 |
+
f"Expected api baseline 2.1, got {bsl}. "
|
| 305 |
+
"Fix is not applied: bsl must come from api_metrics, not iloc[0]."
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
def test_falls_back_to_min_when_api_unavailable(self):
|
| 309 |
+
rmse_series = [1.8, 2.1, 5.3, 9.0, 3.2]
|
| 310 |
+
bsl = self._bsl(api_baseline=None, perf_rmse_values=rmse_series)
|
| 311 |
+
assert bsl == 1.8, (
|
| 312 |
+
f"Fallback should be min(rmse)=1.8, got {bsl}."
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
def test_old_iloc0_would_have_failed_mid_drift(self):
|
| 316 |
+
"""Demonstrate the old bug: iloc[0] on a mid-drift log gives wrong baseline."""
|
| 317 |
+
rmse_series = [10.5, 10.8, 11.2, 11.0, 10.9]
|
| 318 |
+
df = pd.DataFrame({"rmse": rmse_series})
|
| 319 |
+
old_bsl = df["rmse"].iloc[0] # old (broken) logic
|
| 320 |
+
assert old_bsl == 10.5, "Setup check: old logic picks high value"
|
| 321 |
+
# The old bsl would set the baseline hline at 10.5 instead of ~2.1,
|
| 322 |
+
# causing the chart to look flat (everything near or above "baseline")
|
| 323 |
+
assert old_bsl > 5.0, (
|
| 324 |
+
"Old baseline would have been unreasonably high — confirms the bug."
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
def test_alert_threshold_is_correct_fraction_of_bsl(self):
|
| 328 |
+
"""Alert hline must be 15% above baseline."""
|
| 329 |
+
bsl = 2.131
|
| 330 |
+
alert = bsl * 1.15
|
| 331 |
+
assert abs(alert - 2.451) < 0.01, f"Alert threshold wrong: {alert:.3f}"
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
# ---------------------------------------------------------------------------
|
| 335 |
+
# Unit tests: fix #2 — R² y-axis must accommodate negative values
|
| 336 |
+
# ---------------------------------------------------------------------------
|
| 337 |
+
|
| 338 |
+
class TestR2AxisScaling:
|
| 339 |
+
"""
|
| 340 |
+
Verify that the R² chart y-axis lower bound scales to include negative R²
|
| 341 |
+
instead of clipping at 0.
|
| 342 |
+
|
| 343 |
+
Before the fix: range=[0, 1.05] (negative values invisible)
|
| 344 |
+
After the fix: range=[r2_floor, 1.05] where r2_floor < 0 when data dips negative
|
| 345 |
+
"""
|
| 346 |
+
|
| 347 |
+
def _r2_floor(self, r2_values: list) -> float:
|
| 348 |
+
"""Replicate the fixed dashboard r2_floor calculation."""
|
| 349 |
+
r2_min = min(r2_values)
|
| 350 |
+
return min(r2_min - 0.05, -0.1) if r2_min < 0 else -0.05
|
| 351 |
+
|
| 352 |
+
def test_negative_r2_produces_negative_floor(self):
|
| 353 |
+
r2_series = [0.91, 0.60, -0.49, -1.22, 0.83]
|
| 354 |
+
floor = self._r2_floor(r2_series)
|
| 355 |
+
assert floor < 0, f"r2_floor must be negative when data goes below 0, got {floor}"
|
| 356 |
+
assert floor <= -1.22 - 0.05, (
|
| 357 |
+
f"Floor {floor} is not low enough to show min r2=-1.22 "
|
| 358 |
+
"(should be min - 0.05 = -1.27)"
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
def test_all_positive_r2_uses_small_negative_floor(self):
|
| 362 |
+
r2_series = [0.91, 0.88, 0.93, 0.85]
|
| 363 |
+
floor = self._r2_floor(r2_series)
|
| 364 |
+
assert floor == -0.05, (
|
| 365 |
+
f"When all R² > 0, floor should be -0.05 for breathing room, got {floor}"
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
def test_floor_is_below_min_r2(self):
|
| 369 |
+
"""Floor must always be below the minimum R² value so no data is clipped."""
|
| 370 |
+
for min_r2 in [-0.05, -0.5, -1.0, -1.22]:
|
| 371 |
+
r2_series = [0.9, min_r2]
|
| 372 |
+
floor = self._r2_floor(r2_series)
|
| 373 |
+
assert floor <= min_r2, (
|
| 374 |
+
f"At min_r2={min_r2}, floor={floor} clips data (must be <= min_r2)"
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
+
def test_old_hardcoded_range_clipped_negative_r2(self):
|
| 378 |
+
"""Show that the old range=[0, 1.05] would have hidden the negative data."""
|
| 379 |
+
old_range_min = 0
|
| 380 |
+
r2_min_in_data = -1.22
|
| 381 |
+
assert r2_min_in_data < old_range_min, (
|
| 382 |
+
"Confirms bug: min R² in data is below old y-axis floor of 0"
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# ---------------------------------------------------------------------------
|
| 387 |
+
# Selenium: verify chart renders correctly with fixed logic
|
| 388 |
+
# ---------------------------------------------------------------------------
|
| 389 |
+
|
| 390 |
+
@pytest.mark.selenium
|
| 391 |
+
class TestChartFixes:
|
| 392 |
+
"""End-to-end Selenium tests verifying the two chart fixes in production."""
|
| 393 |
+
|
| 394 |
+
def test_overview_chart_section_visible(self, driver):
|
| 395 |
+
from selenium.webdriver.common.by import By
|
| 396 |
+
body = driver.find_element(By.TAG_NAME, "body").text
|
| 397 |
+
assert "Prediction Error Over Time" in body, (
|
| 398 |
+
"RMSE chart section heading not visible on Overview"
|
| 399 |
+
)
|
| 400 |
+
|
| 401 |
+
def test_baseline_annotation_present_in_chart(self, driver):
|
| 402 |
+
"""
|
| 403 |
+
The 'Baseline' hline annotation must appear in the rendered SVG.
|
| 404 |
+
If bsl was computed from a high iloc[0], the annotation would still
|
| 405 |
+
appear but at the wrong Y level — this confirms it's rendered at all.
|
| 406 |
+
"""
|
| 407 |
+
from selenium.webdriver.common.by import By
|
| 408 |
+
page_source = driver.page_source
|
| 409 |
+
assert "Baseline" in page_source, (
|
| 410 |
+
"Baseline annotation not found in rendered page source. "
|
| 411 |
+
"Chart may not have rendered."
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
def test_alert_annotation_present_in_chart(self, driver):
|
| 415 |
+
from selenium.webdriver.common.by import By
|
| 416 |
+
page_source = driver.page_source
|
| 417 |
+
assert "Alert" in page_source or "+15%" in page_source, (
|
| 418 |
+
"Alert +15% annotation not found in rendered chart."
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
def test_r2_chart_section_visible(self, driver):
|
| 422 |
+
from selenium.webdriver.common.by import By
|
| 423 |
+
page_source = driver.page_source
|
| 424 |
+
# R² label should appear as an axis title in the SVG
|
| 425 |
+
assert "R²" in page_source or "R\u00b2" in page_source, (
|
| 426 |
+
"R² chart axis label not found — chart may not have rendered."
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
def test_no_traceback_on_overview(self, driver):
|
| 430 |
+
from selenium.webdriver.common.by import By
|
| 431 |
+
assert "Traceback (most recent call last)" not in \
|
| 432 |
+
driver.find_element(By.TAG_NAME, "body").text
|
| 433 |
+
|
| 434 |
+
def test_overview_screenshot_with_fixes(self, driver):
|
| 435 |
+
"""Save a screenshot showing the fixed chart for visual verification."""
|
| 436 |
+
screenshot_path = ROOT / "assets" / "overview_chart_fixed.png"
|
| 437 |
+
driver.save_screenshot(str(screenshot_path))
|