DevodG committed on
Commit
7c8ef89
·
1 Parent(s): 6a7281e

stabilize: remove duplicate daemon, non-fatal config, lazy graph, feature flags, gated routers

Browse files

- Remove duplicate daemon start (was started at module level AND startup event)
- Replace sys.exit(1) in config validation with warning log (degraded mode)
- Lazy graph compilation with error handling (no import-time crash)
- Add feature flag system (daemon, learning, sentinel, simulation, adaptive, self_training, experimental)
- Gate optional routers behind feature flags (learning=false, sentinel=false by default)
- Move service initialization to startup event (no import-time side effects)
- Add ensure_data_dirs() for idempotent runtime directory creation
- Add .env.example with all vars and feature flag defaults
- Add /health/features endpoint for feature flag status

backend/.env.example CHANGED
@@ -82,3 +82,26 @@ SIMULATION_TRIGGER_KEYWORDS=simulate,predict,what if,reaction,scenario,public op
82
 
83
  # ---------- Domain Packs ----------
84
  FINANCE_DOMAIN_PACK_ENABLED=true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  # ---------- Domain Packs ----------
84
  FINANCE_DOMAIN_PACK_ENABLED=true
85
+
86
+ # ---------- Feature Flags (default: safe/minimal) ----------
87
+ # Control which subsystems start. Defaults minimize blast radius.
88
+ FEATURE_DAEMON=true
89
+ FEATURE_LEARNING=false
90
+ FEATURE_SENTINEL=false
91
+ FEATURE_SIMULATION=true
92
+ FEATURE_ADAPTIVE=false
93
+ FEATURE_SELF_TRAINING=false
94
+ FEATURE_EXPERIMENTAL=false
95
+
96
+ # ---------- Learning Layer ----------
97
+ LEARNING_ENABLED=false
98
+ KNOWLEDGE_MAX_SIZE_MB=200
99
+ LEARNING_SCHEDULE_INTERVAL=6
100
+ LEARNING_BATCH_SIZE=10
101
+ LEARNING_TOPICS=finance,markets,technology,policy
102
+
103
+ # ---------- Performance ----------
104
+ REQUEST_TIMEOUT_SECONDS=30
105
+ MAX_RESEARCH_SOURCES=5
106
+ CACHE_TTL_GENERIC_DAYS=30
107
+ CACHE_TTL_SPECIFIC_DAYS=7
backend/app/config.py CHANGED
@@ -85,7 +85,6 @@ FINANCE_DOMAIN_PACK_ENABLED = (
85
 
86
  # Configuration validation
87
  import logging
88
- import sys
89
 
90
  logger = logging.getLogger(__name__)
91
 
@@ -177,12 +176,13 @@ def validate_config():
177
  except Exception as e:
178
  errors.append(f"Failed to create data directories: {e}")
179
 
180
- # Log results
181
  if errors:
182
- logger.error("Configuration validation failed with errors:")
 
 
183
  for error in errors:
184
  logger.error(f" - {error}")
185
- sys.exit(1)
186
 
187
  if warnings:
188
  logger.warning("Configuration validation completed with warnings:")
@@ -191,13 +191,67 @@ def validate_config():
191
  else:
192
  logger.info("Configuration validation passed")
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- # Run validation on import (startup)
196
- validate_config()
197
 
 
198
 
199
- # Learning layer configuration
200
- LEARNING_ENABLED = os.getenv("LEARNING_ENABLED", "true").lower() == "true"
201
  KNOWLEDGE_MAX_SIZE_MB = int(os.getenv("KNOWLEDGE_MAX_SIZE_MB", "200"))
202
  LEARNING_SCHEDULE_INTERVAL = int(os.getenv("LEARNING_SCHEDULE_INTERVAL", "6")) # hours
203
  LEARNING_BATCH_SIZE = int(os.getenv("LEARNING_BATCH_SIZE", "10"))
 
85
 
86
  # Configuration validation
87
  import logging
 
88
 
89
  logger = logging.getLogger(__name__)
90
 
 
176
  except Exception as e:
177
  errors.append(f"Failed to create data directories: {e}")
178
 
179
+ # Log results β€” NEVER exit, always allow degraded mode
180
  if errors:
181
+ logger.error(
182
+ "Configuration validation errors (app will start in degraded mode):"
183
+ )
184
  for error in errors:
185
  logger.error(f" - {error}")
 
186
 
187
  if warnings:
188
  logger.warning("Configuration validation completed with warnings:")
 
191
  else:
192
  logger.info("Configuration validation passed")
193
 
194
+ return warnings
195
+
196
+
197
+ # ── Data directory initialization ────────────────────────────────────────────
198
+
199
+ ALL_DATA_DIRS = [
200
+ DATA_DIR,
201
+ MEMORY_DIR,
202
+ SIMULATION_DIR,
203
+ DATA_DIR / "memory",
204
+ DATA_DIR / "simulations",
205
+ DATA_DIR / "logs",
206
+ DATA_DIR / "knowledge",
207
+ DATA_DIR / "skills",
208
+ DATA_DIR / "prompt_versions",
209
+ DATA_DIR / "learning",
210
+ DATA_DIR / "cache",
211
+ DATA_DIR / "adaptive",
212
+ DATA_DIR / "sentinel",
213
+ DATA_DIR / "sentinel" / "pending_patches",
214
+ DATA_DIR / "curiosity",
215
+ DATA_DIR / "daemon",
216
+ DATA_DIR / "dreams",
217
+ DATA_DIR / "memory_graph",
218
+ DATA_DIR / "router_state",
219
+ ]
220
+
221
+
222
+ def ensure_data_dirs():
223
+ """Idempotent: create all runtime data dirs. Call once at startup."""
224
+ for d in ALL_DATA_DIRS:
225
+ try:
226
+ d.mkdir(parents=True, exist_ok=True)
227
+ except Exception as e:
228
+ logger.warning(f"Failed to create data dir {d}: {e}")
229
+
230
+
231
+ # ── Feature Flags ────────────────────────────────────────────────────────────
232
+
233
+ FEATURES = {
234
+ "daemon": os.getenv("FEATURE_DAEMON", "true").lower() == "true",
235
+ "learning": os.getenv("FEATURE_LEARNING", "false").lower() == "true",
236
+ "sentinel": os.getenv("FEATURE_SENTINEL", "false").lower() == "true",
237
+ "simulation": os.getenv("FEATURE_SIMULATION", "true").lower() == "true",
238
+ "adaptive": os.getenv("FEATURE_ADAPTIVE", "false").lower() == "true",
239
+ "self_training": os.getenv("FEATURE_SELF_TRAINING", "false").lower() == "true",
240
+ "experimental": os.getenv("FEATURE_EXPERIMENTAL", "false").lower() == "true",
241
+ }
242
+
243
+
244
+ def get_feature_status():
245
+ """Return current feature flag status."""
246
+ return {
247
+ name: {"enabled": enabled, "env_var": f"FEATURE_{name.upper()}"}
248
+ for name, enabled in FEATURES.items()
249
+ }
250
 
 
 
251
 
252
+ # ── Learning layer configuration ─────────────────────────────────────────────
253
 
254
+ LEARNING_ENABLED = os.getenv("LEARNING_ENABLED", "false").lower() == "true"
 
255
  KNOWLEDGE_MAX_SIZE_MB = int(os.getenv("KNOWLEDGE_MAX_SIZE_MB", "200"))
256
  LEARNING_SCHEDULE_INTERVAL = int(os.getenv("LEARNING_SCHEDULE_INTERVAL", "6")) # hours
257
  LEARNING_BATCH_SIZE = int(os.getenv("LEARNING_BATCH_SIZE", "10"))
backend/app/graph.py CHANGED
@@ -115,11 +115,40 @@ def build_graph():
115
  return g.compile()
116
 
117
 
118
- compiled_graph = build_graph()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
 
121
  def run_case(user_input: str, context: dict = None) -> dict:
122
  """Run the optimized agent pipeline on user input."""
 
123
  case_id = str(uuid.uuid4())
124
  t0 = time.perf_counter()
125
  logger.info("Starting case %s", case_id)
@@ -135,7 +164,7 @@ def run_case(user_input: str, context: dict = None) -> dict:
135
  if context:
136
  initial_state["context"] = context
137
 
138
- result = compiled_graph.invoke(initial_state)
139
 
140
  elapsed = time.perf_counter() - t0
141
  logger.info("Case %s completed in %.2fs", case_id, elapsed)
 
115
  return g.compile()
116
 
117
 
118
+ # Lazy graph compilation β€” prevents import-time crash if agents fail to load
119
+ _compiled_graph = None
120
+ _graph_build_error = None
121
+
122
+
123
+ def get_compiled_graph():
124
+ """Lazy graph compilation with error handling. Call at runtime, not import."""
125
+ global _compiled_graph, _graph_build_error
126
+ if _compiled_graph is not None:
127
+ return _compiled_graph
128
+ if _graph_build_error is not None:
129
+ raise RuntimeError(f"Graph compilation previously failed: {_graph_build_error}")
130
+ try:
131
+ _compiled_graph = build_graph()
132
+ logger.info("LangGraph pipeline compiled successfully")
133
+ return _compiled_graph
134
+ except Exception as e:
135
+ _graph_build_error = str(e)
136
+ logger.error(f"LangGraph build failed: {e}")
137
+ raise
138
+
139
+
140
+ def graph_status():
141
+ """Return graph compilation status without triggering compilation."""
142
+ if _compiled_graph is not None:
143
+ return {"status": "ready"}
144
+ if _graph_build_error:
145
+ return {"status": "failed", "error": _graph_build_error}
146
+ return {"status": "not_compiled"}
147
 
148
 
149
  def run_case(user_input: str, context: dict = None) -> dict:
150
  """Run the optimized agent pipeline on user input."""
151
+ graph = get_compiled_graph()
152
  case_id = str(uuid.uuid4())
153
  t0 = time.perf_counter()
154
  logger.info("Starting case %s", case_id)
 
164
  if context:
165
  initial_state["context"] = context
166
 
167
+ result = graph.invoke(initial_state)
168
 
169
  elapsed = time.perf_counter() - t0
170
  logger.info("Case %s completed in %.2fs", case_id, elapsed)
backend/app/main.py CHANGED
@@ -41,7 +41,8 @@ from app.routers.learning import (
41
  )
42
  from app.routers.sentinel import router as sentinel_router
43
  from app.routers.finance import router as finance_router
44
- from app.config import get_config
 
45
 
46
  logging.basicConfig(level=logging.INFO)
47
  logger = logging.getLogger(__name__)
@@ -53,14 +54,8 @@ from app.domain_packs.init_packs import init_domain_packs
53
 
54
  init_domain_packs()
55
 
56
- # Initialize learning layer
57
  config = get_config()
58
- if config.learning_enabled:
59
- try:
60
- init_learning_services(config)
61
- logger.info("Learning layer initialized")
62
- except Exception as e:
63
- logger.error(f"Failed to initialize learning layer: {e}")
64
 
65
  app.add_middleware(
66
  CORSMiddleware,
@@ -75,11 +70,30 @@ app.add_middleware(
75
  allow_headers=["*"],
76
  )
77
 
78
- app.include_router(simulation_router)
79
- app.include_router(learning_router)
80
- app.include_router(sentinel_router)
81
  app.include_router(finance_router)
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # ── Request Timing Middleware ─────────────────────────────────────────────────
85
 
@@ -119,16 +133,38 @@ async def global_exception_handler(request: Request, exc: Exception):
119
  @app.on_event("startup")
120
  async def on_startup():
121
  """Start background tasks on app startup."""
122
- if config.learning_enabled:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  try:
 
124
  start_scheduler_background()
125
- logger.info("Background learning scheduler started")
126
  except Exception as e:
127
- logger.error(f"Failed to start learning scheduler: {e}")
128
 
129
- # Start sentinel scheduler
130
- sentinel_enabled = os.getenv("SENTINEL_ENABLED", "true").lower() == "true"
131
- if sentinel_enabled:
132
  try:
133
  from app.services.sentinel.scheduler import start_sentinel_scheduler
134
 
@@ -137,13 +173,12 @@ async def on_startup():
137
  except Exception as e:
138
  logger.error(f"Failed to start sentinel scheduler: {e}")
139
 
140
- # Start Janus daemon in background thread
141
- daemon_enabled = os.getenv("DAEMON_ENABLED", "true").lower() == "true"
142
- if daemon_enabled:
143
  try:
144
  import threading
145
- from app.services.daemon import janus_daemon
146
 
 
147
  daemon_thread = threading.Thread(
148
  target=janus_daemon.run, daemon=True, name="janus-daemon"
149
  )
@@ -153,6 +188,12 @@ async def on_startup():
153
  logger.error(f"Failed to start Janus daemon: {e}")
154
 
155
 
 
 
 
 
 
 
156
  # ── Health ────────────────────────────────────────────────────────────────────
157
 
158
 
@@ -166,6 +207,14 @@ def health_deep():
166
  return deep_health()
167
 
168
 
 
 
 
 
 
 
 
 
169
  @app.get("/context")
170
  def get_context():
171
  """Get the current system context."""
@@ -273,6 +322,22 @@ def get_training_report():
273
  return self_training_engine.get_training_report()
274
 
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  @app.get("/config/status")
277
  def config_status():
278
  return {
@@ -313,34 +378,20 @@ def agent_detail(agent_name: str):
313
 
314
 
315
  # ── Caching & Intelligence Services ──────────────────────────────────────────
316
-
317
- query_classifier = QueryClassifier()
318
- cache_manager = IntelligentCacheManager()
319
- learning_filter = LearningFilter()
320
- memory_graph = MemoryGraph()
 
321
 
322
  # ── Background Daemon ────────────────────────────────────────────────────────
323
-
 
 
324
  janus_daemon = None
325
 
326
 
327
- def start_janus_daemon():
328
- """Start the background intelligence daemon."""
329
- global janus_daemon
330
- try:
331
- janus_daemon = JanusDaemon()
332
- import threading
333
-
334
- thread = threading.Thread(target=janus_daemon.run, daemon=True)
335
- thread.start()
336
- logger.info("Janus background daemon started")
337
- except Exception as e:
338
- logger.error(f"Failed to start Janus daemon: {e}")
339
-
340
-
341
- start_janus_daemon()
342
-
343
-
344
  # ── Case Execution ────────────────────────────────────────────────────────────
345
 
346
 
@@ -381,6 +432,13 @@ def _fire_and_forget_learning(payload: dict):
381
  @app.post("/run")
382
  def run_org(task: UserTask):
383
  try:
 
 
 
 
 
 
 
384
  user_input = task.user_input
385
  logger.info("Processing /run: %s", user_input[:100])
386
 
@@ -671,11 +729,6 @@ def case_delete(case_id: str):
671
  return {"deleted": True, "case_id": case_id}
672
 
673
 
674
- @app.get("/memory/stats")
675
- def memory_stats_endpoint():
676
- return memory_stats()
677
-
678
-
679
  # ── Prompts ───────────────────────────────────────────────────────────────────
680
 
681
 
@@ -853,9 +906,15 @@ def trigger_curiosity_cycle():
853
 
854
 
855
  @app.get("/memory/stats")
856
- def memory_graph_stats():
857
- """Get memory graph statistics."""
858
- return memory_graph.get_stats()
 
 
 
 
 
 
859
 
860
 
861
  @app.get("/memory/queries")
 
41
  )
42
  from app.routers.sentinel import router as sentinel_router
43
  from app.routers.finance import router as finance_router
44
+ from app.config import get_config, FEATURES, ensure_data_dirs
45
+ from app.services.dataset_persistence import load_on_startup, save_on_shutdown
46
 
47
  logging.basicConfig(level=logging.INFO)
48
  logger = logging.getLogger(__name__)
 
54
 
55
  init_domain_packs()
56
 
57
+ # Config is needed for feature flags; learning services init moved to startup event
58
  config = get_config()
 
 
 
 
 
 
59
 
60
  app.add_middleware(
61
  CORSMiddleware,
 
70
  allow_headers=["*"],
71
  )
72
 
73
+ # ── Router Registration ──────────────────────────────────────────────────────
74
+
75
+ # Always-on: finance (core domain pack)
76
  app.include_router(finance_router)
77
 
78
+ # Feature-gated routers
79
+ if FEATURES.get("simulation", True):
80
+ app.include_router(simulation_router)
81
+ logger.info("Simulation router enabled")
82
+ else:
83
+ logger.info("Simulation router disabled (FEATURE_SIMULATION=false)")
84
+
85
+ if FEATURES.get("learning", False):
86
+ app.include_router(learning_router)
87
+ logger.info("Learning router enabled")
88
+ else:
89
+ logger.info("Learning router disabled (FEATURE_LEARNING=false)")
90
+
91
+ if FEATURES.get("sentinel", False):
92
+ app.include_router(sentinel_router)
93
+ logger.info("Sentinel router enabled")
94
+ else:
95
+ logger.info("Sentinel router disabled (FEATURE_SENTINEL=false)")
96
+
97
 
98
  # ── Request Timing Middleware ─────────────────────────────────────────────────
99
 
 
133
  @app.on_event("startup")
134
  async def on_startup():
135
  """Start background tasks on app startup."""
136
+ global query_classifier, cache_manager, learning_filter, memory_graph, janus_daemon
137
+
138
+ # Step 1: Create all runtime data directories (idempotent)
139
+ ensure_data_dirs()
140
+
141
+ # Step 2: Restore daemon data from dataset repo (non-blocking)
142
+ try:
143
+ load_on_startup()
144
+ except Exception as e:
145
+ logger.warning(f"Dataset persistence unavailable: {e}")
146
+
147
+ # Step 3: Initialize core services (always)
148
+ try:
149
+ query_classifier = QueryClassifier()
150
+ cache_manager = IntelligentCacheManager()
151
+ learning_filter = LearningFilter()
152
+ memory_graph = MemoryGraph()
153
+ logger.info("Core services initialized")
154
+ except Exception as e:
155
+ logger.error(f"Core services init failed: {e} β€” continuing in degraded mode")
156
+
157
+ # Step 4: Initialize learning layer (feature-gated)
158
+ if FEATURES.get("learning", False) and config.learning_enabled:
159
  try:
160
+ init_learning_services(config)
161
  start_scheduler_background()
162
+ logger.info("Learning layer + scheduler started")
163
  except Exception as e:
164
+ logger.error(f"Failed to start learning layer: {e}")
165
 
166
+ # Step 5: Start sentinel scheduler (feature-gated)
167
+ if FEATURES.get("sentinel", False):
 
168
  try:
169
  from app.services.sentinel.scheduler import start_sentinel_scheduler
170
 
 
173
  except Exception as e:
174
  logger.error(f"Failed to start sentinel scheduler: {e}")
175
 
176
+ # Step 6: Start Janus daemon in background thread (feature-gated)
177
+ if FEATURES.get("daemon", True):
 
178
  try:
179
  import threading
 
180
 
181
+ janus_daemon = JanusDaemon()
182
  daemon_thread = threading.Thread(
183
  target=janus_daemon.run, daemon=True, name="janus-daemon"
184
  )
 
188
  logger.error(f"Failed to start Janus daemon: {e}")
189
 
190
 
191
+ @app.on_event("shutdown")
192
+ async def on_shutdown():
193
+ """Save daemon data to dataset repo before shutdown."""
194
+ save_on_shutdown()
195
+
196
+
197
  # ── Health ────────────────────────────────────────────────────────────────────
198
 
199
 
 
207
  return deep_health()
208
 
209
 
210
+ @app.get("/health/features")
211
+ def feature_status():
212
+ """Get current feature flag status."""
213
+ from app.config import get_feature_status
214
+
215
+ return get_feature_status()
216
+
217
+
218
  @app.get("/context")
219
  def get_context():
220
  """Get the current system context."""
 
322
  return self_training_engine.get_training_report()
323
 
324
 
325
+ @app.get("/self/continuous-training")
326
+ def get_continuous_training_status():
327
+ """Get continuous self-training status."""
328
+ from app.services.continuous_training import continuous_self_trainer
329
+
330
+ return continuous_self_trainer.get_status()
331
+
332
+
333
+ @app.post("/self/continuous-training/run")
334
+ def trigger_continuous_training():
335
+ """Manually trigger a continuous training cycle."""
336
+ from app.services.continuous_training import continuous_self_trainer
337
+
338
+ return continuous_self_trainer.run_training_cycle()
339
+
340
+
341
  @app.get("/config/status")
342
  def config_status():
343
  return {
 
378
 
379
 
380
  # ── Caching & Intelligence Services ──────────────────────────────────────────
381
+ # These are initialized in the startup event to avoid import-time side effects.
382
+ # Declared here as module-level None so endpoints can reference them.
383
+ query_classifier = None
384
+ cache_manager = None
385
+ learning_filter = None
386
+ memory_graph = None
387
 
388
  # ── Background Daemon ────────────────────────────────────────────────────────
389
+ # Daemon is started in the startup event (on_startup), NOT here.
390
+ # Starting it at module level creates a duplicate thread with the startup event,
391
+ # causing data races on shared state (_pending_thoughts, signal_queue, files).
392
  janus_daemon = None
393
 
394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  # ── Case Execution ────────────────────────────────────────────────────────────
396
 
397
 
 
432
  @app.post("/run")
433
  def run_org(task: UserTask):
434
  try:
435
+ # Guard: core services must be available
436
+ if query_classifier is None:
437
+ raise HTTPException(
438
+ status_code=503,
439
+ detail="Core services unavailable β€” app is in degraded mode",
440
+ )
441
+
442
  user_input = task.user_input
443
  logger.info("Processing /run: %s", user_input[:100])
444
 
 
729
  return {"deleted": True, "case_id": case_id}
730
 
731
 
 
 
 
 
 
732
  # ── Prompts ───────────────────────────────────────────────────────────────────
733
 
734
 
 
906
 
907
 
908
  @app.get("/memory/stats")
909
+ def memory_stats_endpoint():
910
+ """Get memory graph statistics for the pulse page."""
911
+ stats = memory_graph.get_stats()
912
+ return {
913
+ "queries": stats.get("total_queries", 0),
914
+ "entities": stats.get("total_entities", 0),
915
+ "links": stats.get("total_links", 0),
916
+ "domains": stats.get("domain_counts", {}),
917
+ }
918
 
919
 
920
  @app.get("/memory/queries")