DevodG committed on
Commit
7c8ef89
·
1 Parent(s): 6a7281e

stabilize: remove duplicate daemon, non-fatal config, lazy graph, feature flags, gated routers

Browse files

- Remove duplicate daemon start (was started at module level AND startup event)
- Replace sys.exit(1) in config validation with warning log (degraded mode)
- Lazy graph compilation with error handling (no import-time crash)
- Add feature flag system (daemon, learning, sentinel, simulation, adaptive, self_training, experimental)
- Gate optional routers behind feature flags (learning=false, sentinel=false by default)
- Move service initialization to startup event (no import-time side effects)
- Add ensure_data_dirs() for idempotent runtime directory creation
- Add .env.example with all vars and feature flag defaults
- Add /health/features endpoint for feature flag status

backend/.env.example CHANGED
@@ -82,3 +82,26 @@ SIMULATION_TRIGGER_KEYWORDS=simulate,predict,what if,reaction,scenario,public op
82
 
83
  # ---------- Domain Packs ----------
84
  FINANCE_DOMAIN_PACK_ENABLED=true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  # ---------- Domain Packs ----------
84
  FINANCE_DOMAIN_PACK_ENABLED=true
85
+
86
+ # ---------- Feature Flags (default: safe/minimal) ----------
87
+ # Control which subsystems start. Defaults minimize blast radius.
88
+ FEATURE_DAEMON=true
89
+ FEATURE_LEARNING=false
90
+ FEATURE_SENTINEL=false
91
+ FEATURE_SIMULATION=true
92
+ FEATURE_ADAPTIVE=false
93
+ FEATURE_SELF_TRAINING=false
94
+ FEATURE_EXPERIMENTAL=false
95
+
96
+ # ---------- Learning Layer ----------
97
+ LEARNING_ENABLED=false
98
+ KNOWLEDGE_MAX_SIZE_MB=200
99
+ LEARNING_SCHEDULE_INTERVAL=6
100
+ LEARNING_BATCH_SIZE=10
101
+ LEARNING_TOPICS=finance,markets,technology,policy
102
+
103
+ # ---------- Performance ----------
104
+ REQUEST_TIMEOUT_SECONDS=30
105
+ MAX_RESEARCH_SOURCES=5
106
+ CACHE_TTL_GENERIC_DAYS=30
107
+ CACHE_TTL_SPECIFIC_DAYS=7
backend/app/config.py CHANGED
@@ -85,7 +85,6 @@ FINANCE_DOMAIN_PACK_ENABLED = (
85
 
86
  # Configuration validation
87
  import logging
88
- import sys
89
 
90
  logger = logging.getLogger(__name__)
91
 
@@ -177,12 +176,13 @@ def validate_config():
177
  except Exception as e:
178
  errors.append(f"Failed to create data directories: {e}")
179
 
180
- # Log results
181
  if errors:
182
- logger.error("Configuration validation failed with errors:")
 
 
183
  for error in errors:
184
  logger.error(f" - {error}")
185
- sys.exit(1)
186
 
187
  if warnings:
188
  logger.warning("Configuration validation completed with warnings:")
@@ -191,13 +191,67 @@ def validate_config():
191
  else:
192
  logger.info("Configuration validation passed")
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- # Run validation on import (startup)
196
- validate_config()
197
 
 
198
 
199
- # Learning layer configuration
200
- LEARNING_ENABLED = os.getenv("LEARNING_ENABLED", "true").lower() == "true"
201
  KNOWLEDGE_MAX_SIZE_MB = int(os.getenv("KNOWLEDGE_MAX_SIZE_MB", "200"))
202
  LEARNING_SCHEDULE_INTERVAL = int(os.getenv("LEARNING_SCHEDULE_INTERVAL", "6")) # hours
203
  LEARNING_BATCH_SIZE = int(os.getenv("LEARNING_BATCH_SIZE", "10"))
 
85
 
86
  # Configuration validation
87
  import logging
 
88
 
89
  logger = logging.getLogger(__name__)
90
 
 
176
  except Exception as e:
177
  errors.append(f"Failed to create data directories: {e}")
178
 
179
+ # Log results β€” NEVER exit, always allow degraded mode
180
  if errors:
181
+ logger.error(
182
+ "Configuration validation errors (app will start in degraded mode):"
183
+ )
184
  for error in errors:
185
  logger.error(f" - {error}")
 
186
 
187
  if warnings:
188
  logger.warning("Configuration validation completed with warnings:")
 
191
  else:
192
  logger.info("Configuration validation passed")
193
 
194
+ return warnings
195
+
196
+
197
+ # ── Data directory initialization ────────────────────────────────────────────
198
+
199
+ ALL_DATA_DIRS = [
200
+ DATA_DIR,
201
+ MEMORY_DIR,
202
+ SIMULATION_DIR,
203
+ DATA_DIR / "memory",
204
+ DATA_DIR / "simulations",
205
+ DATA_DIR / "logs",
206
+ DATA_DIR / "knowledge",
207
+ DATA_DIR / "skills",
208
+ DATA_DIR / "prompt_versions",
209
+ DATA_DIR / "learning",
210
+ DATA_DIR / "cache",
211
+ DATA_DIR / "adaptive",
212
+ DATA_DIR / "sentinel",
213
+ DATA_DIR / "sentinel" / "pending_patches",
214
+ DATA_DIR / "curiosity",
215
+ DATA_DIR / "daemon",
216
+ DATA_DIR / "dreams",
217
+ DATA_DIR / "memory_graph",
218
+ DATA_DIR / "router_state",
219
+ ]
220
+
221
+
222
+ def ensure_data_dirs():
223
+ """Idempotent: create all runtime data dirs. Call once at startup."""
224
+ for d in ALL_DATA_DIRS:
225
+ try:
226
+ d.mkdir(parents=True, exist_ok=True)
227
+ except Exception as e:
228
+ logger.warning(f"Failed to create data dir {d}: {e}")
229
+
230
+
231
+ # ── Feature Flags ────────────────────────────────────────────────────────────
232
+
233
+ FEATURES = {
234
+ "daemon": os.getenv("FEATURE_DAEMON", "true").lower() == "true",
235
+ "learning": os.getenv("FEATURE_LEARNING", "false").lower() == "true",
236
+ "sentinel": os.getenv("FEATURE_SENTINEL", "false").lower() == "true",
237
+ "simulation": os.getenv("FEATURE_SIMULATION", "true").lower() == "true",
238
+ "adaptive": os.getenv("FEATURE_ADAPTIVE", "false").lower() == "true",
239
+ "self_training": os.getenv("FEATURE_SELF_TRAINING", "false").lower() == "true",
240
+ "experimental": os.getenv("FEATURE_EXPERIMENTAL", "false").lower() == "true",
241
+ }
242
+
243
+
244
+ def get_feature_status():
245
+ """Return current feature flag status."""
246
+ return {
247
+ name: {"enabled": enabled, "env_var": f"FEATURE_{name.upper()}"}
248
+ for name, enabled in FEATURES.items()
249
+ }
250
 
 
 
251
 
252
+ # ── Learning layer configuration ─────────────────────────────────────────────
253
 
254
+ LEARNING_ENABLED = os.getenv("LEARNING_ENABLED", "false").lower() == "true"
 
255
  KNOWLEDGE_MAX_SIZE_MB = int(os.getenv("KNOWLEDGE_MAX_SIZE_MB", "200"))
256
  LEARNING_SCHEDULE_INTERVAL = int(os.getenv("LEARNING_SCHEDULE_INTERVAL", "6")) # hours
257
  LEARNING_BATCH_SIZE = int(os.getenv("LEARNING_BATCH_SIZE", "10"))
backend/app/graph.py CHANGED
@@ -115,11 +115,40 @@ def build_graph():
115
  return g.compile()
116
 
117
 
118
- compiled_graph = build_graph()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
 
121
  def run_case(user_input: str, context: dict = None) -> dict:
122
  """Run the optimized agent pipeline on user input."""
 
123
  case_id = str(uuid.uuid4())
124
  t0 = time.perf_counter()
125
  logger.info("Starting case %s", case_id)
@@ -135,7 +164,7 @@ def run_case(user_input: str, context: dict = None) -> dict:
135
  if context:
136
  initial_state["context"] = context
137
 
138
- result = compiled_graph.invoke(initial_state)
139
 
140
  elapsed = time.perf_counter() - t0
141
  logger.info("Case %s completed in %.2fs", case_id, elapsed)
 
115
  return g.compile()
116
 
117
 
118
+ # Lazy graph compilation β€” prevents import-time crash if agents fail to load
119
+ _compiled_graph = None
120
+ _graph_build_error = None
121
+
122
+
123
+ def get_compiled_graph():
124
+ """Lazy graph compilation with error handling. Call at runtime, not import."""
125
+ global _compiled_graph, _graph_build_error
126
+ if _compiled_graph is not None:
127
+ return _compiled_graph
128
+ if _graph_build_error is not None:
129
+ raise RuntimeError(f"Graph compilation previously failed: {_graph_build_error}")
130
+ try:
131
+ _compiled_graph = build_graph()
132
+ logger.info("LangGraph pipeline compiled successfully")
133
+ return _compiled_graph
134
+ except Exception as e:
135
+ _graph_build_error = str(e)
136
+ logger.error(f"LangGraph build failed: {e}")
137
+ raise
138
+
139
+
140
+ def graph_status():
141
+ """Return graph compilation status without triggering compilation."""
142
+ if _compiled_graph is not None:
143
+ return {"status": "ready"}
144
+ if _graph_build_error:
145
+ return {"status": "failed", "error": _graph_build_error}
146
+ return {"status": "not_compiled"}
147
 
148
 
149
  def run_case(user_input: str, context: dict = None) -> dict:
150
  """Run the optimized agent pipeline on user input."""
151
+ graph = get_compiled_graph()
152
  case_id = str(uuid.uuid4())
153
  t0 = time.perf_counter()
154
  logger.info("Starting case %s", case_id)
 
164
  if context:
165
  initial_state["context"] = context
166
 
167
+ result = graph.invoke(initial_state)
168
 
169
  elapsed = time.perf_counter() - t0
170
  logger.info("Case %s completed in %.2fs", case_id, elapsed)
backend/app/main.py CHANGED
@@ -41,7 +41,8 @@ from app.routers.learning import (
41
  )
42
  from app.routers.sentinel import router as sentinel_router
43
  from app.routers.finance import router as finance_router
44
- from app.config import get_config
 
45
 
46
  logging.basicConfig(level=logging.INFO)
47
  logger = logging.getLogger(__name__)
@@ -53,14 +54,8 @@ from app.domain_packs.init_packs import init_domain_packs
53
 
54
  init_domain_packs()
55
 
56
- # Initialize learning layer
57
  config = get_config()
58
- if config.learning_enabled:
59
- try:
60
- init_learning_services(config)
61
- logger.info("Learning layer initialized")
62
- except Exception as e:
63
- logger.error(f"Failed to initialize learning layer: {e}")
64
 
65
  app.add_middleware(
66
  CORSMiddleware,
@@ -75,11 +70,30 @@ app.add_middleware(
75
  allow_headers=["*"],
76
  )
77
 
78
- app.include_router(simulation_router)
79
- app.include_router(learning_router)
80
- app.include_router(sentinel_router)
81
  app.include_router(finance_router)
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # ── Request Timing Middleware ─────────────────────────────────────────────────
85
 
@@ -119,16 +133,38 @@ async def global_exception_handler(request: Request, exc: Exception):
119
  @app.on_event("startup")
120
  async def on_startup():
121
  """Start background tasks on app startup."""
122
- if config.learning_enabled:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  try:
 
124
  start_scheduler_background()
125
- logger.info("Background learning scheduler started")
126
  except Exception as e:
127
- logger.error(f"Failed to start learning scheduler: {e}")
128
 
129
- # Start sentinel scheduler
130
- sentinel_enabled = os.getenv("SENTINEL_ENABLED", "true").lower() == "true"
131
- if sentinel_enabled:
132
  try:
133
  from app.services.sentinel.scheduler import start_sentinel_scheduler
134
 
@@ -137,13 +173,12 @@ async def on_startup():
137
  except Exception as e:
138
  logger.error(f"Failed to start sentinel scheduler: {e}")
139
 
140
- # Start Janus daemon in background thread
141
- daemon_enabled = os.getenv("DAEMON_ENABLED", "true").lower() == "true"
142
- if daemon_enabled:
143
  try:
144
  import threading
145
- from app.services.daemon import janus_daemon
146
 
 
147
  daemon_thread = threading.Thread(
148
  target=janus_daemon.run, daemon=True, name="janus-daemon"
149
  )
@@ -153,6 +188,12 @@ async def on_startup():
153
  logger.error(f"Failed to start Janus daemon: {e}")
154
 
155
 
 
 
 
 
 
 
156
  # ── Health ────────────────────────────────────────────────────────────────────
157
 
158
 
@@ -166,6 +207,14 @@ def health_deep():
166
  return deep_health()
167
 
168
 
 
 
 
 
 
 
 
 
169
  @app.get("/context")
170
  def get_context():
171
  """Get the current system context."""
@@ -273,6 +322,22 @@ def get_training_report():
273
  return self_training_engine.get_training_report()
274
 
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  @app.get("/config/status")
277
  def config_status():
278
  return {
@@ -313,34 +378,20 @@ def agent_detail(agent_name: str):
313
 
314
 
315
  # ── Caching & Intelligence Services ──────────────────────────────────────────
316
-
317
- query_classifier = QueryClassifier()
318
- cache_manager = IntelligentCacheManager()
319
- learning_filter = LearningFilter()
320
- memory_graph = MemoryGraph()
 
321
 
322
  # ── Background Daemon ────────────────────────────────────────────────────────
323
-
 
 
324
  janus_daemon = None
325
 
326
 
327
- def start_janus_daemon():
328
- """Start the background intelligence daemon."""
329
- global janus_daemon
330
- try:
331
- janus_daemon = JanusDaemon()
332
- import threading
333
-
334
- thread = threading.Thread(target=janus_daemon.run, daemon=True)
335
- thread.start()
336
- logger.info("Janus background daemon started")
337
- except Exception as e:
338
- logger.error(f"Failed to start Janus daemon: {e}")
339
-
340
-
341
- start_janus_daemon()
342
-
343
-
344
  # ── Case Execution ────────────────────────────────────────────────────────────
345
 
346
 
@@ -381,6 +432,13 @@ def _fire_and_forget_learning(payload: dict):
381
  @app.post("/run")
382
  def run_org(task: UserTask):
383
  try:
 
 
 
 
 
 
 
384
  user_input = task.user_input
385
  logger.info("Processing /run: %s", user_input[:100])
386
 
@@ -671,11 +729,6 @@ def case_delete(case_id: str):
671
  return {"deleted": True, "case_id": case_id}
672
 
673
 
674
- @app.get("/memory/stats")
675
- def memory_stats_endpoint():
676
- return memory_stats()
677
-
678
-
679
  # ── Prompts ───────────────────────────────────────────────────────────────────
680
 
681
 
@@ -853,9 +906,15 @@ def trigger_curiosity_cycle():
853
 
854
 
855
  @app.get("/memory/stats")
856
- def memory_graph_stats():
857
- """Get memory graph statistics."""
858
- return memory_graph.get_stats()
 
 
 
 
 
 
859
 
860
 
861
  @app.get("/memory/queries")
 
41
  )
42
  from app.routers.sentinel import router as sentinel_router
43
  from app.routers.finance import router as finance_router
44
+ from app.config import get_config, FEATURES, ensure_data_dirs
45
+ from app.services.dataset_persistence import load_on_startup, save_on_shutdown
46
 
47
  logging.basicConfig(level=logging.INFO)
48
  logger = logging.getLogger(__name__)
 
54
 
55
  init_domain_packs()
56
 
57
+ # Config is needed for feature flags; learning services init moved to startup event
58
  config = get_config()
 
 
 
 
 
 
59
 
60
  app.add_middleware(
61
  CORSMiddleware,
 
70
  allow_headers=["*"],
71
  )
72
 
73
+ # ── Router Registration ──────────────────────────────────────────────────────
74
+
75
+ # Always-on: finance (core domain pack)
76
  app.include_router(finance_router)
77
 
78
+ # Feature-gated routers
79
+ if FEATURES.get("simulation", True):
80
+ app.include_router(simulation_router)
81
+ logger.info("Simulation router enabled")
82
+ else:
83
+ logger.info("Simulation router disabled (FEATURE_SIMULATION=false)")
84
+
85
+ if FEATURES.get("learning", False):
86
+ app.include_router(learning_router)
87
+ logger.info("Learning router enabled")
88
+ else:
89
+ logger.info("Learning router disabled (FEATURE_LEARNING=false)")
90
+
91
+ if FEATURES.get("sentinel", False):
92
+ app.include_router(sentinel_router)
93
+ logger.info("Sentinel router enabled")
94
+ else:
95
+ logger.info("Sentinel router disabled (FEATURE_SENTINEL=false)")
96
+
97
 
98
  # ── Request Timing Middleware ─────────────────────────────────────────────────
99
 
 
133
  @app.on_event("startup")
134
  async def on_startup():
135
  """Start background tasks on app startup."""
136
+ global query_classifier, cache_manager, learning_filter, memory_graph, janus_daemon
137
+
138
+ # Step 1: Create all runtime data directories (idempotent)
139
+ ensure_data_dirs()
140
+
141
+ # Step 2: Restore daemon data from dataset repo (non-blocking)
142
+ try:
143
+ load_on_startup()
144
+ except Exception as e:
145
+ logger.warning(f"Dataset persistence unavailable: {e}")
146
+
147
+ # Step 3: Initialize core services (always)
148
+ try:
149
+ query_classifier = QueryClassifier()
150
+ cache_manager = IntelligentCacheManager()
151
+ learning_filter = LearningFilter()
152
+ memory_graph = MemoryGraph()
153
+ logger.info("Core services initialized")
154
+ except Exception as e:
155
+ logger.error(f"Core services init failed: {e} β€” continuing in degraded mode")
156
+
157
+ # Step 4: Initialize learning layer (feature-gated)
158
+ if FEATURES.get("learning", False) and config.learning_enabled:
159
  try:
160
+ init_learning_services(config)
161
  start_scheduler_background()
162
+ logger.info("Learning layer + scheduler started")
163
  except Exception as e:
164
+ logger.error(f"Failed to start learning layer: {e}")
165
 
166
+ # Step 5: Start sentinel scheduler (feature-gated)
167
+ if FEATURES.get("sentinel", False):
 
168
  try:
169
  from app.services.sentinel.scheduler import start_sentinel_scheduler
170
 
 
173
  except Exception as e:
174
  logger.error(f"Failed to start sentinel scheduler: {e}")
175
 
176
+ # Step 6: Start Janus daemon in background thread (feature-gated)
177
+ if FEATURES.get("daemon", True):
 
178
  try:
179
  import threading
 
180
 
181
+ janus_daemon = JanusDaemon()
182
  daemon_thread = threading.Thread(
183
  target=janus_daemon.run, daemon=True, name="janus-daemon"
184
  )
 
188
  logger.error(f"Failed to start Janus daemon: {e}")
189
 
190
 
191
+ @app.on_event("shutdown")
192
+ async def on_shutdown():
193
+ """Save daemon data to dataset repo before shutdown."""
194
+ save_on_shutdown()
195
+
196
+
197
  # ── Health ────────────────────────────────────────────────────────────────────
198
 
199
 
 
207
  return deep_health()
208
 
209
 
210
+ @app.get("/health/features")
211
+ def feature_status():
212
+ """Get current feature flag status."""
213
+ from app.config import get_feature_status
214
+
215
+ return get_feature_status()
216
+
217
+
218
  @app.get("/context")
219
  def get_context():
220
  """Get the current system context."""
 
322
  return self_training_engine.get_training_report()
323
 
324
 
325
+ @app.get("/self/continuous-training")
326
+ def get_continuous_training_status():
327
+ """Get continuous self-training status."""
328
+ from app.services.continuous_training import continuous_self_trainer
329
+
330
+ return continuous_self_trainer.get_status()
331
+
332
+
333
+ @app.post("/self/continuous-training/run")
334
+ def trigger_continuous_training():
335
+ """Manually trigger a continuous training cycle."""
336
+ from app.services.continuous_training import continuous_self_trainer
337
+
338
+ return continuous_self_trainer.run_training_cycle()
339
+
340
+
341
  @app.get("/config/status")
342
  def config_status():
343
  return {
 
378
 
379
 
380
  # ── Caching & Intelligence Services ──────────────────────────────────────────
381
+ # These are initialized in the startup event to avoid import-time side effects.
382
+ # Declared here as module-level None so endpoints can reference them.
383
+ query_classifier = None
384
+ cache_manager = None
385
+ learning_filter = None
386
+ memory_graph = None
387
 
388
  # ── Background Daemon ────────────────────────────────────────────────────────
389
+ # Daemon is started in the startup event (on_startup), NOT here.
390
+ # Starting it at module level creates a duplicate thread with the startup event,
391
+ # causing data races on shared state (_pending_thoughts, signal_queue, files).
392
  janus_daemon = None
393
 
394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  # ── Case Execution ────────────────────────────────────────────────────────────
396
 
397
 
 
432
  @app.post("/run")
433
  def run_org(task: UserTask):
434
  try:
435
+ # Guard: core services must be available
436
+ if query_classifier is None:
437
+ raise HTTPException(
438
+ status_code=503,
439
+ detail="Core services unavailable β€” app is in degraded mode",
440
+ )
441
+
442
  user_input = task.user_input
443
  logger.info("Processing /run: %s", user_input[:100])
444
 
 
729
  return {"deleted": True, "case_id": case_id}
730
 
731
 
 
 
 
 
 
732
  # ── Prompts ───────────────────────────────────────────────────────────────────
733
 
734
 
 
906
 
907
 
908
  @app.get("/memory/stats")
909
+ def memory_stats_endpoint():
910
+ """Get memory graph statistics for the pulse page."""
911
+ stats = memory_graph.get_stats()
912
+ return {
913
+ "queries": stats.get("total_queries", 0),
914
+ "entities": stats.get("total_entities", 0),
915
+ "links": stats.get("total_links", 0),
916
+ "domains": stats.get("domain_counts", {}),
917
+ }
918
 
919
 
920
  @app.get("/memory/queries")