Spaces:

SolarWine
/

api

Running

App Files Files Community

safraeli committed 6 days ago

Commit

bfbaecb

verified ·

1 Parent(s): 8ff229f

Deploy: data flow monitoring, email alerts, chatbot validation

Browse files

Files changed (7) hide show

backend/api/main.py +34 -0
backend/services/__init__.py +0 -0
backend/services/data_flow_monitor.py +164 -0
backend/services/email_alerter.py +96 -0
config/settings.py +15 -0
src/chatbot/guardrails.py +87 -0
src/chatbot/vineyard_chatbot.py +36 -5

backend/api/main.py CHANGED Viewed

@@ -149,6 +149,38 @@ async def _sensor_refresh_loop(interval_sec: int = 120):
         await asyncio.sleep(interval_sec)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global _start_time
@@ -166,10 +198,12 @@ async def lifespan(app: FastAPI):
     import asyncio
     ims_task = asyncio.create_task(_ims_refresh_loop())
     sensor_task = asyncio.create_task(_sensor_refresh_loop())
     yield
     ims_task.cancel()
     sensor_task.cancel()
     log.info("SolarWine API shutting down (uptime=%.0fs)", get_uptime())

         await asyncio.sleep(interval_sec)
async def _data_flow_alert_loop(interval_sec: int = 300):
    """Background loop: check data flow health, send email alerts if red.

    After an initial 60 s delay, repeats forever:

    1. Runs ``DataFlowMonitor.check_all`` in the default executor.
    2. Notifies ``event_bus`` with ``"health"`` so SSE clients refresh.
    3. Lets ``EmailAlerter`` send alerts for red sources, also in the
       executor because it performs blocking SMTP I/O.

    Any exception raised by one iteration is logged and the loop continues.

    Args:
        interval_sec: Seconds to sleep between two consecutive checks.
    """
    import asyncio
    from backend.services.data_flow_monitor import DataFlowMonitor
    from backend.services.email_alerter import EmailAlerter
    from backend.api.events import event_bus

    await asyncio.sleep(60)  # let startup finish

    monitor = DataFlowMonitor()
    alerter = EmailAlerter()
    if alerter.enabled:
        log.info("Email alerter active → %s", os.environ.get("ALERT_EMAIL_TO", ""))
    else:
        log.info("Email alerter disabled (set SMTP_HOST + ALERT_EMAIL_TO to enable)")

    # We are inside a coroutine, so a running loop is guaranteed to exist.
    loop = asyncio.get_running_loop()

    while True:
        try:
            # Imported inside the try so that an import failure is logged
            # and retried on the next tick instead of killing the task.
            from backend.api.deps import get_datahub

            hub = get_datahub()
            status = await loop.run_in_executor(None, monitor.check_all, hub)

            # Notify SSE clients so the frontend status indicator updates
            await event_bus.notify("health")

            # Send email alerts for red sources. The SMTP conversation is
            # blocking, so keep it off the event loop thread. The alerter is
            # only ever called from this loop, one call at a time.
            if alerter.enabled:
                alerted = await loop.run_in_executor(
                    None, alerter.check_and_alert, status
                )
                if alerted:
                    log.warning("Data flow alerts sent for: %s", ", ".join(alerted))
        except Exception as exc:
            # exc_info keeps the traceback; the message alone is rarely enough
            # to diagnose a failure in a background task.
            log.error("Data flow alert check failed: %s", exc, exc_info=True)

        await asyncio.sleep(interval_sec)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global _start_time
     import asyncio
     ims_task = asyncio.create_task(_ims_refresh_loop())
     sensor_task = asyncio.create_task(_sensor_refresh_loop())
+    alert_task = asyncio.create_task(_data_flow_alert_loop())
     yield
     ims_task.cancel()
     sensor_task.cancel()
+    alert_task.cancel()
     log.info("SolarWine API shutting down (uptime=%.0fs)", get_uptime())

backend/services/__init__.py ADDED Viewed

File without changes

backend/services/data_flow_monitor.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""
+Data flow monitor — checks IMS and ThingsBoard data freshness.
+Returns per-source status (green / yellow / red) with age and messages.
+Used by the /api/health/data-sources endpoint and the email alerter.
+"""
+from __future__ import annotations
+import logging
+from datetime import datetime, timezone
+from typing import Any
+from config.settings import (
+    IMS_STALE_YELLOW_MIN,
+    IMS_STALE_RED_MIN,
+    TB_STALE_YELLOW_MIN,
+    TB_STALE_RED_MIN,
+    ENERGY_STALE_YELLOW_MIN,
+    ENERGY_STALE_RED_MIN,
+)
+from src.data.data_providers import DataHub
+log = logging.getLogger("solarwine.monitor")
+def _classify(age_minutes: float | None, yellow: float, red: float) -> str:
+    """Return green / yellow / red based on age thresholds."""
+    if age_minutes is None:
+        return "red"
+    if age_minutes < yellow:
+        return "green"
+    if age_minutes < red:
+        return "yellow"
+    return "red"
+def _status_message(source: str, status: str, age: float | None) -> str:
+    if status == "green":
+        return f"{source} data is fresh"
+    if age is not None:
+        return f"{source} data is {age:.0f} min old"
+    return f"{source} data is unavailable"
class DataFlowMonitor:
    """Computes per-source data flow status.

    Each ``_check_*`` method queries one provider on the hub and returns a
    dict with at least ``status`` (``"green"``/``"yellow"``/``"red"``),
    ``age_minutes`` and ``message``. A check never raises: any exception from
    a provider is logged and reported as a red source.
    """

    def check_all(self, hub: DataHub) -> dict[str, Any]:
        """Check every source and summarise the result.

        Args:
            hub: Data hub exposing ``weather``, ``vine_sensors`` and
                ``energy`` providers.

        Returns:
            A dict with ``overall`` (worst status of all sources),
            ``checked_at`` (UTC ISO-8601 timestamp) and ``sources``
            (per-source result dicts keyed by ``ims_weather``,
            ``tb_sensors`` and ``tb_energy``).
        """
        now_iso = datetime.now(timezone.utc).isoformat()
        sources: dict[str, dict] = {
            "ims_weather": self._check_ims(hub),
            "tb_sensors": self._check_tb_sensors(hub),
            "tb_energy": self._check_tb_energy(hub),
        }

        # Overall status: worst of all sources
        statuses = {s["status"] for s in sources.values()}
        if "red" in statuses:
            overall = "red"
        elif "yellow" in statuses:
            overall = "yellow"
        else:
            overall = "green"

        return {
            "overall": overall,
            "checked_at": now_iso,
            "sources": sources,
        }

    @staticmethod
    def _red(detail_key: str, message: str) -> dict:
        """Build a red result whose source-specific ``detail_key`` is None."""
        return {
            "status": "red",
            "age_minutes": None,
            detail_key: None,
            "message": message,
        }

    @staticmethod
    def _age_or_none(value: Any) -> float | None:
        """Convert a reported age to float minutes; ``None`` stays ``None``.

        A non-numeric value still raises, so the caller's handler reports the
        source as red rather than guessing.
        """
        return None if value is None else float(value)

    def _check_ims(self, hub: DataHub) -> dict:
        """Check freshness of the IMS weather feed."""
        try:
            wx = hub.weather.get_current()
            if not wx or "error" in wx:
                # wx may be None or empty, so never call .get on it directly.
                reason = (wx or {}).get("error", "unavailable")
                return self._red("last_reading", f"IMS weather error: {reason}")

            age = self._age_or_none(wx.get("age_minutes"))
            if age is not None and age < 0:
                # A negative age is treated as "unknown".
                age = None
            status = _classify(age, IMS_STALE_YELLOW_MIN, IMS_STALE_RED_MIN)
            return {
                "status": status,
                "age_minutes": round(age, 1) if age is not None else None,
                "last_reading": wx.get("timestamp_local") or wx.get("timestamp_utc"),
                "message": _status_message("IMS weather", status, age),
            }
        except Exception as exc:
            log.warning("IMS health check failed: %s", exc)
            return self._red("last_reading", f"IMS weather unreachable: {exc}")

    def _check_tb_sensors(self, hub: DataHub) -> dict:
        """Check freshness of the ThingsBoard vine-sensor snapshot."""
        try:
            snap = hub.vine_sensors.get_snapshot(light=True)
            if not snap or "error" in snap:
                reason = (snap or {}).get("error", "unavailable")
                return self._red("last_reading", f"TB sensors error: {reason}")

            age = self._age_or_none(snap.get("staleness_minutes"))
            status = _classify(age, TB_STALE_YELLOW_MIN, TB_STALE_RED_MIN)
            return {
                "status": status,
                "age_minutes": round(age, 1) if age is not None else None,
                "last_reading": snap.get("timestamp"),
                "message": _status_message("ThingsBoard sensors", status, age),
            }
        except Exception as exc:
            log.warning("TB sensors health check failed: %s", exc)
            return self._red("last_reading", f"TB sensors unreachable: {exc}")

    def _check_tb_energy(self, hub: DataHub) -> dict:
        """Check freshness of the energy telemetry."""
        try:
            en = hub.energy.get_current()
            if not en or "error" in en:
                reason = (en or {}).get("error", "unavailable")
                return self._red("power_kw", f"Energy error: {reason}")

            power = en.get("power_kw")
            # Energy doesn't always expose age_minutes; a missing or None age
            # with otherwise valid data is assumed to be freshly fetched.
            age = self._age_or_none(en.get("age_minutes"))
            if age is None:
                status = "green"
            else:
                status = _classify(age, ENERGY_STALE_YELLOW_MIN, ENERGY_STALE_RED_MIN)
            return {
                "status": status,
                "age_minutes": round(age, 1) if age is not None else None,
                "power_kw": round(float(power), 2) if power is not None else None,
                "message": _status_message("Energy telemetry", status, age),
            }
        except Exception as exc:
            log.warning("Energy health check failed: %s", exc)
            return self._red("power_kw", f"Energy unreachable: {exc}")

backend/services/email_alerter.py ADDED Viewed

	@@ -0,0 +1,96 @@

+"""
+Email alerter — sends notifications when data sources go stale.
+Activated by setting env vars: SMTP_HOST, SMTP_PORT, SMTP_USER, SMTP_PASSWORD, ALERT_EMAIL_TO.
+Respects a per-source cooldown to avoid spamming.
+"""
+from __future__ import annotations
+import logging
+import os
+import smtplib
+import time
+from email.mime.text import MIMEText
+from typing import Any
+from config.settings import ALERT_COOLDOWN_MIN
+log = logging.getLogger("solarwine.alerter")
class EmailAlerter:
    """Sends email alerts when data sources are in 'red' status.

    SMTP settings are read from the environment at construction time:
    ``SMTP_HOST``, ``SMTP_PORT`` (default 587), ``SMTP_USER``,
    ``SMTP_PASSWORD``, ``ALERT_EMAIL_TO`` (comma-separated) and
    ``ALERT_EMAIL_FROM`` (defaults to ``SMTP_USER``). The alerter is enabled
    only when both ``SMTP_HOST`` and ``ALERT_EMAIL_TO`` are set.
    """

    def __init__(self, cooldown_min: float | None = None) -> None:
        """Read the SMTP configuration from the environment.

        Args:
            cooldown_min: Minimum minutes between repeat alerts for the same
                source. Defaults to ``ALERT_COOLDOWN_MIN`` from settings.
        """
        self._last_alert: dict[str, float] = {}  # source_name -> epoch of last alert
        self._smtp_host = os.environ.get("SMTP_HOST", "")
        raw_port = os.environ.get("SMTP_PORT", "587")
        try:
            self._smtp_port = int(raw_port)
        except ValueError:
            # A malformed port must not crash whoever constructs the alerter.
            log.warning("Invalid SMTP_PORT %r; falling back to 587", raw_port)
            self._smtp_port = 587
        self._smtp_user = os.environ.get("SMTP_USER", "")
        self._smtp_password = os.environ.get("SMTP_PASSWORD", "")
        self._alert_to = os.environ.get("ALERT_EMAIL_TO", "")
        self._alert_from = os.environ.get("ALERT_EMAIL_FROM", self._smtp_user)
        if cooldown_min is None:
            cooldown_min = ALERT_COOLDOWN_MIN
        self._cooldown_sec = float(cooldown_min) * 60

    @property
    def enabled(self) -> bool:
        """True when both an SMTP host and a recipient are configured."""
        return bool(self._smtp_host and self._alert_to)

    def check_and_alert(self, status: dict[str, Any]) -> list[str]:
        """Check status and send alerts for red sources.

        Args:
            status: Monitor result with a ``sources`` mapping of
                ``source_name -> {"status", "age_minutes", "message"}`` and
                optional ``checked_at`` / ``overall`` entries.

        Returns:
            Names of the sources for which an email was sent successfully.
            Always empty when the alerter is disabled.
        """
        if not self.enabled:
            return []

        alerted: list[str] = []
        for source_name, info in status.get("sources", {}).items():
            if info.get("status") != "red":
                # Source recovered — clear cooldown so next outage triggers immediately
                self._last_alert.pop(source_name, None)
                continue

            now = time.time()
            if (now - self._last_alert.get(source_name, 0)) < self._cooldown_sec:
                continue  # still within the cooldown window for this source

            subject = f"[SolarWine] Data flow alert: {source_name}"
            body = self._compose_body(source_name, info, status)
            # Start the cooldown only after a successful send, so a failed
            # delivery is retried on the next check.
            if self._send_email(subject, body):
                self._last_alert[source_name] = now
                alerted.append(source_name)
        return alerted

    @staticmethod
    def _compose_body(source_name: str, info: dict[str, Any], status: dict[str, Any]) -> str:
        """Render the plain-text alert body; the Age line is optional."""
        # Built as a list on purpose: putting a conditional expression in the
        # middle of adjacent string literals selects between the text before
        # and after it instead of making a single line optional.
        age = info.get("age_minutes")
        message = info.get("message", f"{source_name} is down")
        lines = [f"Data source: {source_name}", "Status: RED"]
        if age is not None:
            lines.append(f"Age: {age:.0f} min")
        lines += [
            f"Detail: {message}",
            "",
            f"Checked at: {status.get('checked_at', 'unknown')}",
            f"Overall system status: {status.get('overall', 'unknown')}",
            "",
            "---",
            "SolarWine Data Flow Monitor",
        ]
        return "\n".join(lines)

    def _send_email(self, subject: str, body: str) -> bool:
        """Send an email via SMTP. Returns True on success.

        Failures are logged and reported as ``False``; nothing is raised.
        """
        try:
            msg = MIMEText(body, "plain", "utf-8")
            msg["Subject"] = subject
            msg["From"] = self._alert_from
            msg["To"] = self._alert_to

            # Tolerate "a@x.org, b@x.org" and stray commas in ALERT_EMAIL_TO.
            recipients = [
                addr.strip() for addr in self._alert_to.split(",") if addr.strip()
            ]

            with smtplib.SMTP(self._smtp_host, self._smtp_port, timeout=10) as server:
                server.starttls()
                if self._smtp_user and self._smtp_password:
                    server.login(self._smtp_user, self._smtp_password)
                server.sendmail(self._alert_from, recipients, msg.as_string())

            log.info("Alert email sent: %s → %s", subject, self._alert_to)
            return True
        except Exception as exc:
            log.error("Failed to send alert email: %s", exc)
            return False

config/settings.py CHANGED Viewed

@@ -202,3 +202,18 @@ DP_BASE_CROP_VALUE = 0.10
 SIMULATION_LOG_DIR = DATA_DIR / "simulation_logs"
 SIMULATION_LOG_PATH = SIMULATION_LOG_DIR / "control_loop.parquet"
 DAILY_PLAN_PATH = DATA_DIR / "daily_plan.json"

 SIMULATION_LOG_DIR = DATA_DIR / "simulation_logs"
 SIMULATION_LOG_PATH = SIMULATION_LOG_DIR / "control_loop.parquet"
 DAILY_PLAN_PATH = DATA_DIR / "daily_plan.json"
+# ---------------------------------------------------------------------------
+# Data Flow Monitoring
+# ---------------------------------------------------------------------------
+# Staleness thresholds (minutes): age < YELLOW → green, YELLOW <= age < RED → yellow, age >= RED → red
+IMS_STALE_YELLOW_MIN = 60       # IMS weather data older than this = yellow
+IMS_STALE_RED_MIN = 180         # IMS weather data older than this = red
+TB_STALE_YELLOW_MIN = 15        # ThingsBoard sensor data older than this = yellow
+TB_STALE_RED_MIN = 60           # ThingsBoard sensor data older than this = red
+ENERGY_STALE_YELLOW_MIN = 15    # Energy telemetry older than this = yellow
+ENERGY_STALE_RED_MIN = 60       # Energy telemetry older than this = red
+# Email alerts (activated when SMTP_HOST + ALERT_EMAIL_TO env vars are set)
+ALERT_COOLDOWN_MIN = 60         # minimum minutes between repeat alerts for same source

src/chatbot/guardrails.py CHANGED Viewed

@@ -360,4 +360,91 @@ def tag_tool_result(tool_name: str, tool_result: dict) -> dict:
                 "Warn the user that conditions may have changed."
             )
     return tagged

                 "Warn the user that conditions may have changed."
             )
+    # Validate numeric ranges — flag physically impossible values
+    range_warnings = validate_numeric_ranges(tool_name, tool_result)
+    if range_warnings:
+        tagged["_range_warnings"] = range_warnings
     return tagged
+# ---------------------------------------------------------------------------
+# 5. Numeric range validation — catch sensor faults & model errors
+# ---------------------------------------------------------------------------
+# Physical bounds for common fields (field_name → (min, max, unit))
_PHYSICAL_BOUNDS: dict[str, tuple[float, float, str]] = {
    "air_temperature_c": (-10.0, 55.0, "°C"),
    "ghi_w_m2": (0.0, 1400.0, "W/m²"),
    "rh_percent": (0.0, 100.0, "%"),
    "wind_speed_ms": (0.0, 50.0, "m/s"),
    "A_net": (-5.0, 40.0, "µmol CO₂/m²/s"),
    "power_kw": (0.0, 60.0, "kW"),
    "daily_kwh": (0.0, 500.0, "kWh"),
    "PAR": (0.0, 2500.0, "µmol/m²/s"),
    "Tleaf": (-5.0, 60.0, "°C"),
    "VPD": (0.0, 10.0, "kPa"),
    "CO2": (200.0, 800.0, "ppm"),
    "CWSI": (0.0, 1.0, ""),
    "staleness_minutes": (0.0, 1440.0, "min"),
}


def validate_numeric_ranges(tool_name: str, result: dict) -> list[str]:
    """Check tool result values against physical bounds.

    Only top-level keys of ``result`` that appear in ``_PHYSICAL_BOUNDS`` are
    inspected. Missing, ``None`` and non-numeric values are skipped.

    Args:
        tool_name: Name of the tool that produced ``result`` (currently
            unused; kept for interface compatibility).
        result: Tool result mapping field names to values.

    Returns:
        A list of warning strings, one per value that is out of range or
        NaN, in the order the fields are declared in ``_PHYSICAL_BOUNDS``.
    """
    warnings: list[str] = []
    for key, (lo, hi, unit) in _PHYSICAL_BOUNDS.items():
        val = result.get(key)
        if val is None:
            continue
        try:
            v = float(val)
        except (TypeError, ValueError):
            continue
        # Written as a negated range test so that NaN is flagged too: every
        # comparison with NaN is False, so `v < lo or v > hi` would let a
        # NaN reading through as if it were in range.
        if not (lo <= v <= hi):
            warnings.append(
                f"{key}={v:.1f}{unit} is outside physical range "
                f"[{lo:.0f}–{hi:.0f}] — possible sensor fault"
            )
    return warnings
+# ---------------------------------------------------------------------------
+# 6. Cross-source consistency check
+# ---------------------------------------------------------------------------
def check_cross_source_consistency(
    weather: Optional[dict],
    sensors: Optional[dict],
) -> list[str]:
    """Compare IMS weather and TB sensor readings for consistency.

    Args:
        weather: IMS weather reading; its ``air_temperature_c`` is used.
        sensors: On-site sensor snapshot; its ``treatment_air_temp_c`` is used.

    Returns:
        Caveat strings for sources that diverge significantly. Empty when
        either input is missing, empty or carries an ``"error"`` key, when a
        temperature is absent or non-numeric, or when the two temperatures
        differ by 5.0 °C or less.
    """
    found: list[str] = []
    if not weather or not sensors or "error" in weather or "error" in sensors:
        return found

    # Temperature: IMS air temp vs TB treatment air temp
    ims_raw = weather.get("air_temperature_c")
    site_raw = sensors.get("treatment_air_temp_c")
    if ims_raw is None or site_raw is None:
        return found

    try:
        ims_c = float(ims_raw)
        site_c = float(site_raw)
    except (TypeError, ValueError):
        return found

    gap = abs(ims_c - site_c)
    if gap > 5.0:
        found.append(
            f"IMS air temperature ({ims_c:.1f}°C) and on-site sensor "
            f"({site_c:.1f}°C) differ by {gap:.1f}°C — one source may "
            f"be stale or malfunctioning."
        )
    return found

src/chatbot/vineyard_chatbot.py CHANGED Viewed

@@ -26,6 +26,7 @@ from typing import Optional
 from src.data_providers import DataHub
 from src.genai_utils import extract_json_object, get_genai_client, get_google_api_key
 from src.chatbot.guardrails import (
     classify_query,
     estimate_confidence,
     get_source_label,
@@ -323,17 +324,30 @@ _RULE_KEYWORDS = {
 _PINNED_RULES = {"no_shade_before_10", "energy_budget", "temperature_transition"}
-def retrieve_relevant_rules(query: str, max_rules: int = 5) -> list[str]:
     """Retrieve the most relevant biology rules for a query.
     Returns up to ``max_rules`` rule names, always including pinned rules.
-    Uses keyword matching (no vector DB needed for 13 rules).
     """
     query_lower = query.lower()
-    scores: dict[str, int] = {}
     for rule_name, keywords in _RULE_KEYWORDS.items():
-        score = sum(1 for kw in keywords if kw in query_lower)
         if score > 0:
             scores[rule_name] = score
@@ -345,7 +359,6 @@ def retrieve_relevant_rules(query: str, max_rules: int = 5) -> list[str]:
             break
         selected.add(name)
-    # If we still have room, add remaining pinned rules
     return [r for r in BIOLOGY_RULES if r in selected]
@@ -943,6 +956,24 @@ class VineyardChatbot:
                     f"Data is {data_age:.0f} minutes old — conditions may have changed."
                 )
             # Build sources list
             sources: list[str] = []
             if tool_name:

 from src.data_providers import DataHub
 from src.genai_utils import extract_json_object, get_genai_client, get_google_api_key
 from src.chatbot.guardrails import (
+    check_cross_source_consistency,
     classify_query,
     estimate_confidence,
     get_source_label,
 _PINNED_RULES = {"no_shade_before_10", "energy_budget", "temperature_transition"}
+def retrieve_relevant_rules(query: str, max_rules: int = 6) -> list[str]:
     """Retrieve the most relevant biology rules for a query.
     Returns up to ``max_rules`` rule names, always including pinned rules.
+    Uses weighted keyword matching with partial-match support:
+      - Exact keyword match: +2 points
+      - Partial word overlap: +1 point (e.g. "irrigat" matches "irrigation")
     """
     query_lower = query.lower()
+    query_words = set(re.findall(r'\w+', query_lower))
+    scores: dict[str, float] = {}
     for rule_name, keywords in _RULE_KEYWORDS.items():
+        score = 0.0
+        for kw in keywords:
+            if kw in query_lower:
+                # Exact substring match — strong signal
+                score += 2.0
+            else:
+                # Partial word overlap — weaker signal
+                kw_words = set(re.findall(r'\w+', kw))
+                overlap = kw_words & query_words
+                if overlap:
+                    score += len(overlap) * 0.5
         if score > 0:
             scores[rule_name] = score
             break
         selected.add(name)
     return [r for r in BIOLOGY_RULES if r in selected]
                     f"Data is {data_age:.0f} minutes old — conditions may have changed."
                 )
+            # Range validation warnings
+            if tool_result:
+                range_warnings = tool_result.get("_range_warnings") or (
+                    tagged_result.get("_range_warnings") if tool_call else None
+                )
+                if range_warnings:
+                    for rw in range_warnings:
+                        caveats.append(rw)
+            # Cross-source consistency check (when we have both weather + sensors)
+            try:
+                wx_data = self.hub.weather.get_current()
+                sensor_data = self.hub.vine_sensors.get_snapshot(light=True)
+                consistency_caveats = check_cross_source_consistency(wx_data, sensor_data)
+                caveats.extend(consistency_caveats)
+            except Exception:
+                pass
             # Build sources list
             sources: list[str] = []
             if tool_name: