argus-mlops / tests /test_dashboard.py
hodfa840's picture
Fix RMSE baseline hline and R2 axis clipping on Overview chart
baf1ee1
"""Dashboard diagnostic tests.
Covers two layers:
1. Unit tests — data-loading functions work independently of Streamlit
2. Selenium UI tests — the rendered dashboard shows the expected elements
Usage:
pytest tests/test_dashboard.py -v
pytest tests/test_dashboard.py -v -k unit # unit tests only
pytest tests/test_dashboard.py -v -k selenium # UI tests only (needs running dashboard)
The Selenium tests expect the Streamlit dashboard at DASHBOARD_URL (default
http://localhost:8501). Start it first with:
.venv/Scripts/python run.py --no-sim
"""
from __future__ import annotations
import json
import os
import time
from pathlib import Path
import pandas as pd
import pytest
# Two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent

# Base URL of the dashboard under test; the DASHBOARD_URL environment
# variable overrides the local default.
DASHBOARD_URL = os.getenv("DASHBOARD_URL", "http://localhost:8501")

# Logical log name -> JSONL file under <ROOT>/data/logs/.
LOG_PATHS = {
    log_name: ROOT / "data" / "logs" / file_name
    for log_name, file_name in (
        ("performance", "performance.jsonl"),
        ("drift", "drift_reports.jsonl"),
        ("retrain", "retraining.jsonl"),
        ("predictions", "predictions.jsonl"),
    )
}
# ---------------------------------------------------------------------------
# Helpers (mirror dashboard logic without Streamlit dependency)
# ---------------------------------------------------------------------------
def _load_jsonl(path: Path, limit: int = 2000) -> pd.DataFrame:
    """Load the trailing lines of a JSON-lines file into a DataFrame.

    Args:
        path: JSONL file to read, decoded as UTF-8.
        limit: Maximum number of trailing lines to consider (counted before
            unparseable lines are dropped). A value <= 0 yields an empty
            DataFrame.

    Returns:
        A DataFrame built from the successfully parsed lines, or an empty
        DataFrame when the file is missing, ``limit`` is not positive, or no
        line could be parsed.
    """
    # Reject non-positive limits up front: ``lines[-0:]`` is the *whole*
    # list, and a negative limit would silently drop leading lines instead.
    if limit <= 0 or not path.exists():
        return pd.DataFrame()

    tail = path.read_text(encoding="utf-8").splitlines()[-limit:]
    records = []
    for line in tail:
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Best-effort loader: skip lines that are not valid JSON
            # (including blank ones) and keep the rest.
            continue
    return pd.DataFrame(records) if records else pd.DataFrame()
# ---------------------------------------------------------------------------
# Unit tests
# ---------------------------------------------------------------------------
class TestDataLoading:
    """Checks that the JSONL log files are present and load into DataFrames."""

    def test_performance_file_exists(self):
        """The performance log must be on disk."""
        perf_log = LOG_PATHS["performance"]
        assert perf_log.exists(), (
            "performance.jsonl not found — run the simulation first: "
            "python scripts/simulate_drift.py"
        )

    def test_performance_file_has_rmse_column(self):
        """The performance log must be non-empty and carry an 'rmse' column."""
        frame = _load_jsonl(LOG_PATHS["performance"])
        assert not frame.empty, "performance.jsonl is empty"
        assert "rmse" in frame.columns, (
            f"Expected 'rmse' column; got {list(frame.columns)}"
        )

    def test_performance_rmse_values_are_positive(self):
        """Every logged RMSE must be strictly greater than zero."""
        frame = _load_jsonl(LOG_PATHS["performance"])
        if frame.empty:
            pytest.skip("No performance data yet")
        all_positive = frame["rmse"].gt(0).all()
        assert all_positive, "RMSE values must be positive"

    def test_performance_file_has_required_columns(self):
        """The performance log must expose the full set of expected columns."""
        frame = _load_jsonl(LOG_PATHS["performance"])
        if frame.empty:
            pytest.skip("No performance data yet")
        required = {"rmse", "mae", "r2", "n_samples", "timestamp"}
        missing = required.difference(frame.columns)
        assert not missing, f"Missing columns in performance log: {missing}"

    def test_predictions_file_exists_and_has_data(self):
        """The predictions log must exist and contain at least one record."""
        predictions_log = LOG_PATHS["predictions"]
        assert predictions_log.exists(), "predictions.jsonl not found"
        frame = _load_jsonl(predictions_log)
        assert not frame.empty, "predictions.jsonl is empty"

    def test_drift_file_structure(self):
        """When drift reports exist they must include 'drift_detected'."""
        drift_log = LOG_PATHS["drift"]
        if not drift_log.exists():
            pytest.skip("No drift reports yet")
        frame = _load_jsonl(drift_log)
        assert not frame.empty
        assert "drift_detected" in frame.columns, (
            f"Expected 'drift_detected'; got {list(frame.columns)}"
        )

    def test_path_resolution_is_correct(self):
        """PROJECT_ROOT computed from dashboard/app.py must point to repo root."""
        app_module = ROOT / "dashboard" / "app.py"
        resolved_root = app_module.resolve().parent.parent
        assert resolved_root == ROOT.resolve(), (
            f"Path mismatch: dashboard resolves to {resolved_root}, "
            f"expected {ROOT.resolve()}"
        )

    def test_load_jsonl_returns_dataframe_not_empty_when_file_has_data(self):
        """Loading an existing performance log must yield a populated frame."""
        perf_log = LOG_PATHS["performance"]
        if not perf_log.exists():
            pytest.skip("No performance data yet")
        frame = _load_jsonl(perf_log)
        assert isinstance(frame, pd.DataFrame)
        assert not frame.empty
        assert len(frame) > 0

    def test_load_jsonl_handles_missing_file_gracefully(self):
        """A path that does not exist must produce an empty DataFrame."""
        frame = _load_jsonl(ROOT / "data" / "logs" / "nonexistent.jsonl")
        assert isinstance(frame, pd.DataFrame)
        assert frame.empty

    def test_load_jsonl_handles_corrupted_lines_gracefully(self):
        """Unparseable lines are dropped while valid neighbours are kept."""
        import tempfile

        # delete=False so the file can be reopened by path after it is closed;
        # it is removed manually in the ``finally`` below.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
        ) as handle:
            handle.writelines(
                [
                    '{"rmse": 1.5, "mae": 1.2}\n',
                    "NOT JSON\n",
                    '{"rmse": 1.6, "mae": 1.3}\n',
                ]
            )
            scratch = Path(handle.name)
        try:
            frame = _load_jsonl(scratch)
            assert len(frame) == 2, "Should skip corrupted lines and keep valid ones"
            assert list(frame["rmse"]) == [1.5, 1.6]
        finally:
            scratch.unlink()
# ---------------------------------------------------------------------------
# Selenium UI tests
# ---------------------------------------------------------------------------
def _get_driver():
    """Create a headless Chrome WebDriver using webdriver-manager.

    Selenium and webdriver-manager are imported here rather than at module
    level, so they are only needed when a driver is actually requested.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1600,900",
    ):
        chrome_options.add_argument(flag)

    # install() returns the value handed to Service as the driver executable.
    driver_binary = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_binary), options=chrome_options)
def _dashboard_reachable() -> bool:
    """Report whether DASHBOARD_URL can be opened within a 3-second timeout.

    Returns:
        True if ``urllib.request.urlopen`` succeeds, False if it raises for
        any reason (connection refused, timeout, HTTP error, malformed URL).
    """
    import urllib.request

    try:
        # Use the response as a context manager so it is closed right away
        # instead of lingering until garbage collection.
        with urllib.request.urlopen(DASHBOARD_URL, timeout=3):
            return True
    except Exception:
        # Deliberately broad: this is only a reachability probe, and any
        # failure simply means "not reachable".
        return False
@pytest.fixture(scope="module")
def driver():
    """Module-scoped headless Chrome session with the dashboard loaded.

    Skips the requesting tests when nothing answers at DASHBOARD_URL.
    Otherwise yields the WebDriver after loading the page and pausing for a
    fixed 6 seconds, and quits the browser on teardown.
    """
    if not _dashboard_reachable():
        pytest.skip(f"Dashboard not running at {DASHBOARD_URL}")
    drv = _get_driver()
    try:
        drv.get(DASHBOARD_URL)
        # Fixed pause before handing the driver to tests — presumably to let
        # the dashboard finish its initial render.
        time.sleep(6)
        yield drv
    finally:
        # Quit even when the initial page load raises, so a failed setup
        # does not leave a browser process behind.
        drv.quit()
@pytest.mark.selenium
class TestDashboardUI:
    """Selenium tests against the live Streamlit dashboard.

    Every test receives the module-scoped ``driver`` fixture. Selenium is
    imported inside the methods that need it, so importing this module does
    not itself require Selenium.
    """

    @staticmethod
    def _body_text(driver) -> str:
        """Return the text of the page's ``<body>`` element."""
        from selenium.webdriver.common.by import By

        return driver.find_element(By.TAG_NAME, "body").text

    def test_page_title_is_argus(self, driver):
        """The page title must contain 'Argus'."""
        assert "Argus" in driver.title, (
            f"Expected 'Argus' in page title, got: {driver.title!r}"
        )

    def test_sidebar_is_visible(self, driver):
        """An element with data-testid='stSidebar' must be present in the DOM."""
        from selenium.webdriver.common.by import By

        sidebar = driver.find_elements(By.CSS_SELECTOR, "[data-testid='stSidebar']")
        assert sidebar, "Sidebar element not found"

    def test_api_status_shown_in_sidebar(self, driver):
        """The page text must show either 'API Online' or 'API Offline'."""
        body_text = self._body_text(driver)
        assert any(kw in body_text for kw in ("API Online", "API Offline")), (
            "Expected API status badge in sidebar"
        )

    def test_navigation_pages_present(self, driver):
        """Every navigation option must appear in the page text."""
        body_text = self._body_text(driver)
        for page in ("Overview", "Drift Analysis", "Feature Insights",
                     "Retraining Log", "Live Demo"):
            assert page in body_text, f"Navigation option '{page}' not found"

    def test_overview_metrics_rendered(self, driver):
        """The Overview metric labels must appear in the page text."""
        body_text = self._body_text(driver)
        for label in ("Rolling RMSE", "Baseline RMSE", "Labeled Samples"):
            assert label in body_text, f"Metric '{label}' not visible on Overview"

    def test_no_python_traceback_on_page(self, driver):
        """The page text must not contain a Python traceback header."""
        body_text = self._body_text(driver)
        assert "Traceback (most recent call last)" not in body_text, (
            "Python traceback found on dashboard page"
        )

    def test_chart_renders_when_data_present(self, driver):
        """If performance data exists, the RMSE chart must be visible (not 'No data')."""
        if not LOG_PATHS["performance"].exists():
            pytest.skip("No performance data — chart absence is expected")
        df = _load_jsonl(LOG_PATHS["performance"])
        if df.empty:
            pytest.skip("performance.jsonl is empty — chart absence is expected")
        body_text = self._body_text(driver)
        no_data_msg = "No performance data yet"
        assert no_data_msg not in body_text, (
            f"Dashboard shows '{no_data_msg}' but performance.jsonl has "
            f"{len(df)} rows. Root cause: auto-refresh clears the cache "
            "BEFORE chart code runs, causing an infinite blank loop."
        )

    def test_refresh_now_button_exists(self, driver):
        """A button labelled exactly 'Refresh Now' must be on the page."""
        from selenium.webdriver.common.by import By

        buttons = driver.find_elements(By.TAG_NAME, "button")
        labels = [b.text.strip() for b in buttons]
        assert "Refresh Now" in labels, (
            f"'Refresh Now' button not found. Available buttons: {labels}"
        )

    def test_clicking_refresh_loads_chart(self, driver):
        """Click Refresh Now and verify the chart is present after an 8-second wait.

        Skipped when there is no performance data. Fails if the button cannot
        be found, because the assertion below would otherwise pass without
        the refresh ever having been triggered.
        """
        if not LOG_PATHS["performance"].exists():
            pytest.skip("No performance data")
        df = _load_jsonl(LOG_PATHS["performance"])
        if df.empty:
            pytest.skip("performance.jsonl is empty")
        from selenium.webdriver.common.by import By

        for btn in driver.find_elements(By.TAG_NAME, "button"):
            if btn.text.strip() == "Refresh Now":
                btn.click()
                break
        else:
            pytest.fail("'Refresh Now' button not found — cannot trigger a refresh")
        # Fixed pause before inspecting the page again. A polling wait is
        # avoided on purpose: the 'no data' message may be absent immediately
        # after the click, before the page has been redrawn.
        time.sleep(8)
        body_text = self._body_text(driver)
        no_data_msg = "No performance data yet"
        assert no_data_msg not in body_text, (
            "Chart still absent after clicking Refresh Now"
        )

    def test_screenshot_on_failure(self, driver, request):
        """Save a screenshot to assets/test_screenshot.png for inspection.

        Despite its name this runs unconditionally; ``request`` is accepted
        but not used.
        """
        screenshot_path = ROOT / "assets" / "test_screenshot.png"
        # The target directory may not exist on a fresh checkout.
        screenshot_path.parent.mkdir(parents=True, exist_ok=True)
        # save_screenshot reports failure through its return value, so check it.
        assert driver.save_screenshot(str(screenshot_path)), (
            f"Could not write screenshot to {screenshot_path}"
        )
# ---------------------------------------------------------------------------
# Unit tests: fix #1 — baseline RMSE must not use iloc[0] from the log
# ---------------------------------------------------------------------------
class TestBaselineRmseLogic:
    """
    Verify that the baseline hline calculation uses api_metrics baseline_rmse
    rather than the first row of the performance log.
    Before the fix: bsl = perf_df["rmse"].iloc[0]
    After the fix: bsl = baseline or perf_df["rmse"].min()
    If the log starts mid-drift (high RMSE), iloc[0] would have been wrong.
    """

    def _bsl(self, api_baseline, perf_rmse_values: list) -> float:
        """Replicate the fixed dashboard bsl calculation.

        Args:
            api_baseline: Baseline RMSE reported by the API, or a falsy value
                (``None``, ``0``) when it is unavailable.
            perf_rmse_values: RMSE values standing in for the performance log.

        Returns:
            ``api_baseline`` when it is truthy, otherwise the minimum of
            ``perf_rmse_values`` as a float.
        """
        df = pd.DataFrame({"rmse": perf_rmse_values})
        baseline = api_baseline
        # Truthiness (not ``is None``) is intentional: it mirrors the
        # ``baseline or min`` expression quoted in the class docstring.
        return baseline if baseline else float(df["rmse"].min())

    def test_uses_api_baseline_when_available(self):
        """A truthy API baseline wins over anything in the log."""
        # Log starts at a high value (simulating mid-drift start)
        rmse_series = [10.5, 10.8, 11.2, 11.0, 10.9]
        bsl = self._bsl(api_baseline=2.1, perf_rmse_values=rmse_series)
        assert bsl == 2.1, (
            f"Expected api baseline 2.1, got {bsl}. "
            "Fix is not applied: bsl must come from api_metrics, not iloc[0]."
        )

    def test_falls_back_to_min_when_api_unavailable(self):
        """Without an API baseline the minimum logged RMSE is used."""
        rmse_series = [1.8, 2.1, 5.3, 9.0, 3.2]
        bsl = self._bsl(api_baseline=None, perf_rmse_values=rmse_series)
        assert bsl == 1.8, (
            f"Fallback should be min(rmse)=1.8, got {bsl}."
        )

    def test_old_iloc0_would_have_failed_mid_drift(self):
        """Demonstrate the old bug: iloc[0] on a mid-drift log gives wrong baseline."""
        rmse_series = [10.5, 10.8, 11.2, 11.0, 10.9]
        df = pd.DataFrame({"rmse": rmse_series})
        old_bsl = df["rmse"].iloc[0]  # old (broken) logic
        assert old_bsl == 10.5, "Setup check: old logic picks high value"
        # The old bsl would set the baseline hline at 10.5 instead of ~2.1,
        # causing the chart to look flat (everything near or above "baseline")
        assert old_bsl > 5.0, (
            "Old baseline would have been unreasonably high — confirms the bug."
        )

    def test_alert_threshold_is_correct_fraction_of_bsl(self):
        """Alert hline must be 15% above baseline."""
        bsl = 2.131
        alert = bsl * 1.15
        assert abs(alert - 2.451) < 0.01, f"Alert threshold wrong: {alert:.3f}"
# ---------------------------------------------------------------------------
# Unit tests: fix #2 — R² y-axis must accommodate negative values
# ---------------------------------------------------------------------------
class TestR2AxisScaling:
    """
    Check that the lower bound of the R² chart's y-axis follows the data
    below zero rather than stopping at 0.
    Old behaviour: range=[0, 1.05], so negative R² values were not shown.
    New behaviour: range=[r2_floor, 1.05], with r2_floor < 0 once the data
    dips negative.
    """

    def _r2_floor(self, r2_values: list) -> float:
        """Mirror the dashboard's r2_floor rule.

        Returns -0.05 when the smallest value is not negative; otherwise the
        smaller of (smallest value - 0.05) and -0.1.
        """
        lowest = min(r2_values)
        if lowest < 0:
            return min(lowest - 0.05, -0.1)
        return -0.05

    def test_negative_r2_produces_negative_floor(self):
        """A series that dips below zero must push the floor below its minimum."""
        samples = [0.91, 0.60, -0.49, -1.22, 0.83]
        floor = self._r2_floor(samples)
        assert floor < 0, f"r2_floor must be negative when data goes below 0, got {floor}"
        assert floor <= -1.22 - 0.05, (
            f"Floor {floor} is not low enough to show min r2=-1.22 "
            "(should be min - 0.05 = -1.27)"
        )

    def test_all_positive_r2_uses_small_negative_floor(self):
        """An all-positive series gets the fixed -0.05 floor."""
        samples = [0.91, 0.88, 0.93, 0.85]
        floor = self._r2_floor(samples)
        assert floor == -0.05, (
            f"When all R² > 0, floor should be -0.05 for breathing room, got {floor}"
        )

    def test_floor_is_below_min_r2(self):
        """The floor must never sit above the smallest R², so nothing is clipped."""
        for min_r2 in (-0.05, -0.5, -1.0, -1.22):
            floor = self._r2_floor([0.9, min_r2])
            assert floor <= min_r2, (
                f"At min_r2={min_r2}, floor={floor} clips data (must be <= min_r2)"
            )

    def test_old_hardcoded_range_clipped_negative_r2(self):
        """Record that the former floor of 0 sat above the negative data."""
        r2_min_in_data = -1.22
        old_range_min = 0
        assert r2_min_in_data < old_range_min, (
            "Confirms bug: min R² in data is below old y-axis floor of 0"
        )
# ---------------------------------------------------------------------------
# Selenium: verify chart renders correctly with fixed logic
# ---------------------------------------------------------------------------
@pytest.mark.selenium
class TestChartFixes:
    """End-to-end Selenium tests verifying the two chart fixes in production.

    Every test receives the module-scoped ``driver`` fixture. Tests that only
    read ``driver.page_source`` or ``driver.title`` need no Selenium imports.
    """

    def test_overview_chart_section_visible(self, driver):
        """The RMSE chart heading must appear in the page text."""
        from selenium.webdriver.common.by import By

        body = driver.find_element(By.TAG_NAME, "body").text
        assert "Prediction Error Over Time" in body, (
            "RMSE chart section heading not visible on Overview"
        )

    def test_baseline_annotation_present_in_chart(self, driver):
        """
        The 'Baseline' hline annotation must appear in the rendered SVG.
        If bsl was computed from a high iloc[0], the annotation would still
        appear but at the wrong Y level — this confirms it's rendered at all.
        """
        # NOTE(review): this is a plain substring search over the whole page
        # source, so the "Baseline RMSE" metric label checked in
        # TestDashboardUI also satisfies it even if the chart is missing.
        page_source = driver.page_source
        assert "Baseline" in page_source, (
            "Baseline annotation not found in rendered page source. "
            "Chart may not have rendered."
        )

    def test_alert_annotation_present_in_chart(self, driver):
        """The page source must mention the alert threshold ('Alert' or '+15%')."""
        page_source = driver.page_source
        assert "Alert" in page_source or "+15%" in page_source, (
            "Alert +15% annotation not found in rendered chart."
        )

    def test_r2_chart_section_visible(self, driver):
        """The page source must contain the 'R²' label."""
        page_source = driver.page_source
        # R² label should appear as an axis title in the SVG. A single check
        # suffices: "²" and "\u00b2" are the same character.
        assert "R²" in page_source, (
            "R² chart axis label not found — chart may not have rendered."
        )

    def test_no_traceback_on_overview(self, driver):
        """The page text must not contain a Python traceback header."""
        from selenium.webdriver.common.by import By

        body_text = driver.find_element(By.TAG_NAME, "body").text
        assert "Traceback (most recent call last)" not in body_text

    def test_overview_screenshot_with_fixes(self, driver):
        """Save a screenshot showing the fixed chart for visual verification."""
        screenshot_path = ROOT / "assets" / "overview_chart_fixed.png"
        # The target directory may not exist on a fresh checkout.
        screenshot_path.parent.mkdir(parents=True, exist_ok=True)
        # save_screenshot reports failure through its return value, so check it.
        assert driver.save_screenshot(str(screenshot_path)), (
            f"Could not write screenshot to {screenshot_path}"
        )