Jonathan Harrison

Full Codette codebase sync — transparency release

74f2af5 3 days ago

27.5 kB

	"""
	Codette Unified Memory — SQLite + FTS5 Backed Cocoon Store
	===========================================================

	Consolidates three previously separate memory systems:
	1. CognitionCocooner (JSON files on disk)
	2. LivingMemoryKernel (in-memory MemoryCocoons)
	3. CodetteSession (SQLite conversation state)

	Into ONE system with:
	- SQLite backing for persistence + ACID guarantees
	- FTS5 full-text search for fast relevance matching (replaces O(n) file scan)
	- In-memory LRU cache for hot cocoons
	- Unified API for store/recall/search
	- Migration from legacy JSON cocoons on first load

	Schema:
	cocoons(id, query, response, adapter, domain, complexity, emotion,
	importance, timestamp, metadata_json)
	cocoons_fts(query, response) -- FTS5 virtual table

	Author: Jonathan Harrison (Raiff's Bits LLC)
	"""

	import json
	import math
	import sqlite3
	import time
	import hashlib
	import os
	import logging
	from pathlib import Path
	from typing import Dict, List, Optional, Any, Tuple
	from collections import OrderedDict

	logger = logging.getLogger(__name__)

	DB_DIR = Path(__file__).parent.parent / "data"
	DB_PATH = DB_DIR / "codette_memory.db"
	LEGACY_COCOON_DIR = Path(__file__).parent.parent / "cocoons"

	# In-memory cache size
	CACHE_MAX = 200


	class UnifiedMemory:
	"""
	Single source of truth for all Codette memory.

	Replaces CognitionCocooner + LivingMemoryKernel + session memory
	with one SQLite-backed store using FTS5 for fast relevance search.
	"""

	def __init__(self, db_path: Optional[Path] = None,
	legacy_dir: Optional[Path] = None):
	self.db_path = db_path or DB_PATH
	self.db_path.parent.mkdir(parents=True, exist_ok=True)
	self.legacy_dir = legacy_dir or LEGACY_COCOON_DIR

	# In-memory LRU cache (id -> cocoon dict)
	self._cache: OrderedDict = OrderedDict()

	# Initialize database
	self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
	self._conn.row_factory = sqlite3.Row
	self._init_schema()

	# Stats
	self._total_stored = self._count()
	self._cache_hits = 0
	self._cache_misses = 0

	# Migrate legacy cocoons on first use
	if self._total_stored == 0 and self.legacy_dir.exists():
	self._migrate_legacy()

	logger.info(f"UnifiedMemory: {self._total_stored} cocoons in {self.db_path}")

	def _init_schema(self):
	"""Create tables and FTS5 index if they don't exist."""
	cur = self._conn.cursor()

	cur.execute("""
	CREATE TABLE IF NOT EXISTS cocoons (
	id TEXT PRIMARY KEY,
	query TEXT NOT NULL,
	response TEXT NOT NULL,
	adapter TEXT DEFAULT 'unknown',
	domain TEXT DEFAULT 'general',
	complexity TEXT DEFAULT 'MEDIUM',
	emotion TEXT DEFAULT 'neutral',
	importance INTEGER DEFAULT 7,
	timestamp REAL NOT NULL,
	metadata_json TEXT DEFAULT '{}'
	)
	""")

	# FTS5 virtual table for fast full-text search
	cur.execute("""
	CREATE VIRTUAL TABLE IF NOT EXISTS cocoons_fts
	USING fts5(query, response, content='cocoons', content_rowid='rowid')
	""")

	# Triggers to keep FTS in sync
	cur.execute("""
	CREATE TRIGGER IF NOT EXISTS cocoons_ai AFTER INSERT ON cocoons BEGIN
	INSERT INTO cocoons_fts(rowid, query, response)
	VALUES (new.rowid, new.query, new.response);
	END
	""")
	cur.execute("""
	CREATE TRIGGER IF NOT EXISTS cocoons_ad AFTER DELETE ON cocoons BEGIN
	INSERT INTO cocoons_fts(cocoons_fts, rowid, query, response)
	VALUES ('delete', old.rowid, old.query, old.response);
	END
	""")
	cur.execute("""
	CREATE TRIGGER IF NOT EXISTS cocoons_au AFTER UPDATE ON cocoons BEGIN
	INSERT INTO cocoons_fts(cocoons_fts, rowid, query, response)
	VALUES ('delete', old.rowid, old.query, old.response);
	INSERT INTO cocoons_fts(rowid, query, response)
	VALUES (new.rowid, new.query, new.response);
	END
	""")

	# Index on timestamp for recency queries
	cur.execute("""
	CREATE INDEX IF NOT EXISTS idx_cocoons_timestamp
	ON cocoons(timestamp DESC)
	""")

	# Index on adapter for dominance analysis
	cur.execute("""
	CREATE INDEX IF NOT EXISTS idx_cocoons_adapter
	ON cocoons(adapter)
	""")

	self._conn.commit()

	def _count(self) -> int:
	"""Count total cocoons in database."""
	cur = self._conn.cursor()
	cur.execute("SELECT COUNT(*) FROM cocoons")
	return cur.fetchone()[0]

	# ─────────────────────────────────────────────────────────
	# STORE
	# ─────────────────────────────────────────────────────────
	def store(self, query: str, response: str, adapter: str = "unknown",
	domain: str = "general", complexity: str = "MEDIUM",
	emotion: str = "neutral", importance: int = 7,
	metadata: Optional[Dict] = None) -> str:
	"""
	Store a reasoning exchange as a cocoon.

	This is the unified replacement for:
	- CognitionCocooner.wrap_reasoning()
	- LivingMemoryKernel.store()
	- CodetteSession.add_message()

	Returns cocoon ID.
	"""
	cocoon_id = f"cocoon_{int(time.time())}_{hashlib.md5(query.encode()).hexdigest()[:6]}"
	timestamp = time.time()
	meta_json = json.dumps(metadata or {})

	try:
	cur = self._conn.cursor()
	cur.execute("""
	INSERT OR REPLACE INTO cocoons
	(id, query, response, adapter, domain, complexity, emotion,
	importance, timestamp, metadata_json)
	VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	""", (
	cocoon_id,
	query[:500], # Cap query length
	response[:2000], # Cap response length
	adapter,
	domain,
	complexity,
	emotion,
	importance,
	timestamp,
	meta_json,
	))
	self._conn.commit()
	self._total_stored += 1

	# Cache it
	cocoon = {
	"id": cocoon_id, "query": query[:500], "response": response[:2000],
	"adapter": adapter, "domain": domain, "complexity": complexity,
	"emotion": emotion, "importance": importance,
	"timestamp": timestamp, "metadata": metadata or {},
	}
	self._cache_put(cocoon_id, cocoon)

	return cocoon_id
	except Exception as e:
	logger.error(f"Failed to store cocoon: {e}")
	return ""

	# ─────────────────────────────────────────────────────────
	# RECALL — FTS5 powered relevance search
	# ─────────────────────────────────────────────────────────
	def recall_relevant(self, query: str, max_results: int = 3,
	min_importance: int = 0,
	identity_id: str = "",
	recency_weight: float = 0.3,
	success_weight: float = 0.2,
	identity_weight: float = 0.2) -> List[Dict]:
	"""
	Find cocoons relevant to a query using FTS5 + multi-signal ranking.

	Ranking combines four signals:
	1. FTS5 relevance (text match quality) — base signal
	2. Recency — newer cocoons rank higher (exponential decay)
	3. Success — cocoons marked as successful rank higher
	4. Identity — cocoons linked to the current user rank higher

	Weight params control the balance (0.0 = disabled, 1.0 = dominant).
	"""
	if not query.strip():
	return self.recall_recent(max_results)

	# Build FTS5 query: extract significant words
	stop_words = {
	"the", "a", "an", "is", "are", "was", "were", "be", "been",
	"have", "has", "had", "do", "does", "did", "will", "would",
	"could", "should", "can", "to", "of", "in", "for", "on",
	"with", "at", "by", "from", "as", "and", "but", "or", "if",
	"it", "its", "this", "that", "i", "me", "my", "we", "you",
	"what", "how", "why", "when", "where", "who", "about", "just",
	"not", "no", "so", "very", "really", "also", "too", "up",
	}
	words = [
	w.strip(".,!?;:\"'()[]{}").lower()
	for w in query.split()
	if len(w) > 2 and w.lower().strip(".,!?;:\"'()[]{}") not in stop_words
	]

	if not words:
	return self.recall_recent(max_results)

	# FTS5 query: OR-join significant words
	fts_query = " OR ".join(f'"{w}"' for w in words[:8]) # Cap at 8 terms

	# Fetch more candidates than needed for re-ranking
	fetch_limit = max(max_results * 4, 12)

	try:
	cur = self._conn.cursor()
	sql = """
	SELECT c.id, c.query, c.response, c.adapter, c.domain,
	c.complexity, c.emotion, c.importance, c.timestamp,
	c.metadata_json,
	rank
	FROM cocoons_fts
	JOIN cocoons c ON cocoons_fts.rowid = c.rowid
	WHERE cocoons_fts MATCH ?
	AND c.importance >= ?
	ORDER BY rank
	LIMIT ?
	"""
	cur.execute(sql, (fts_query, min_importance, fetch_limit))
	rows = cur.fetchall()

	if not rows:
	return self.recall_recent(max_results)

	# Multi-signal re-ranking
	now = time.time()
	scored = []
	for row in rows:
	cocoon = dict(row)
	cocoon["metadata"] = json.loads(cocoon.pop("metadata_json", "{}"))

	# Base: FTS5 rank (negative = better match, normalize to 0-1)
	fts_score = 1.0 / (1.0 + abs(cocoon.get("rank", 0)))

	# Recency: exponential decay (half-life = 1 hour)
	age_seconds = now - cocoon.get("timestamp", now)
	recency_score = math.exp(-age_seconds / 3600.0)

	# Success: check metadata for success marker
	meta = cocoon.get("metadata", {})
	success_score = 1.0 if meta.get("success", True) else 0.3

	# Identity: boost if cocoon is linked to current user
	identity_score = 0.5 # neutral
	if identity_id:
	cocoon_identity = meta.get("identity_id", "")
	if cocoon_identity == identity_id:
	identity_score = 1.0
	elif cocoon_identity:
	identity_score = 0.2 # different user's cocoon

	# Combined score (weighted)
	relevance_weight = 1.0 - recency_weight - success_weight - identity_weight
	combined = (
	relevance_weight * fts_score +
	recency_weight * recency_score +
	success_weight * success_score +
	identity_weight * identity_score
	)

	cocoon["_rank_score"] = round(combined, 4)
	cocoon.pop("rank", None)
	scored.append(cocoon)

	# Sort by combined score (descending)
	scored.sort(key=lambda c: c["_rank_score"], reverse=True)

	results = scored[:max_results]
	self._cache_hits += len(results)
	return results

	except Exception as e:
	logger.debug(f"FTS5 ranked search failed: {e}")
	return self.recall_recent(max_results)

	def recall_recent(self, limit: int = 5) -> List[Dict]:
	"""Get N most recent cocoons."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT id, query, response, adapter, domain, complexity,
	emotion, importance, timestamp, metadata_json
	FROM cocoons
	ORDER BY timestamp DESC
	LIMIT ?
	""", (limit,))
	rows = cur.fetchall()
	results = []
	for row in rows:
	cocoon = dict(row)
	cocoon["metadata"] = json.loads(cocoon.pop("metadata_json", "{}"))
	results.append(cocoon)
	return results
	except Exception as e:
	logger.debug(f"Recent recall failed: {e}")
	return []

	def recall_by_emotion(self, emotion: str, limit: int = 5) -> List[Dict]:
	"""Recall cocoons with specific emotional tag."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT id, query, response, adapter, domain, complexity,
	emotion, importance, timestamp, metadata_json
	FROM cocoons
	WHERE emotion = ?
	ORDER BY timestamp DESC
	LIMIT ?
	""", (emotion, limit))
	return [dict(r) for r in cur.fetchall()]
	except Exception:
	return []

	def recall_by_domain(self, domain: str, limit: int = 5) -> List[Dict]:
	"""Recall cocoons from a specific domain."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT id, query, response, adapter, domain, complexity,
	emotion, importance, timestamp, metadata_json
	FROM cocoons
	WHERE domain = ?
	ORDER BY importance DESC, timestamp DESC
	LIMIT ?
	""", (domain, limit))
	results = []
	for row in cur.fetchall():
	cocoon = dict(row)
	cocoon["metadata"] = json.loads(cocoon.pop("metadata_json", "{}"))
	results.append(cocoon)
	return results
	except Exception:
	return []

	def recall_multi_domain(self, domains: List[str], limit_per: int = 3) -> List[Dict]:
	"""Recall cocoons across multiple domains, limit_per each."""
	results = []
	for domain in domains:
	results.extend(self.recall_by_domain(domain, limit_per))
	# Also search by FTS for domain keywords not captured by exact match
	for domain in domains:
	fts_results = self.recall_relevant(domain, max_results=limit_per)
	for r in fts_results:
	if r.get("id") not in {c.get("id") for c in results}:
	results.append(r)
	return results

	def recall_by_adapter(self, adapter: str, limit: int = 5) -> List[Dict]:
	"""Recall cocoons generated by specific adapter."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT id, query, response, adapter, domain, complexity,
	emotion, importance, timestamp, metadata_json
	FROM cocoons
	WHERE adapter = ?
	ORDER BY timestamp DESC
	LIMIT ?
	""", (adapter, limit))
	return [dict(r) for r in cur.fetchall()]
	except Exception:
	return []

	def recall_important(self, min_importance: int = 7, limit: int = 10) -> List[Dict]:
	"""Recall high-importance cocoons (replaces LivingMemoryKernel.recall_important)."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT id, query, response, adapter, domain, complexity,
	emotion, importance, timestamp, metadata_json
	FROM cocoons
	WHERE importance >= ?
	ORDER BY importance DESC, timestamp DESC
	LIMIT ?
	""", (min_importance, limit))
	return [dict(r) for r in cur.fetchall()]
	except Exception:
	return []

	# ─────────────────────────────────────────────────────────
	# SUCCESS MARKING — for ranked recall feedback loop
	# ─────────────────────────────────────────────────────────
	def mark_success(self, cocoon_id: str, success: bool = True,
	identity_id: str = ""):
	"""
	Mark a cocoon as successful or unsuccessful.

	This feeds back into ranked recall — successful cocoons
	get boosted in future searches, unsuccessful ones get demoted.
	"""
	try:
	cur = self._conn.cursor()
	cur.execute(
	"SELECT metadata_json FROM cocoons WHERE id = ?",
	(cocoon_id,)
	)
	row = cur.fetchone()
	if row:
	meta = json.loads(row["metadata_json"] or "{}")
	meta["success"] = success
	if identity_id:
	meta["identity_id"] = identity_id
	cur.execute(
	"UPDATE cocoons SET metadata_json = ? WHERE id = ?",
	(json.dumps(meta), cocoon_id)
	)
	self._conn.commit()
	except Exception as e:
	logger.debug(f"mark_success failed: {e}")

	# ─────────────────────────────────────────────────────────
	# INTROSPECTION — adapter dominance, domain clusters, trends
	# ─────────────────────────────────────────────────────────
	def adapter_dominance(self) -> Dict:
	"""Analyze adapter usage distribution."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT adapter, COUNT(*) as cnt
	FROM cocoons
	GROUP BY adapter
	ORDER BY cnt DESC
	""")
	rows = cur.fetchall()
	total = sum(r["cnt"] for r in rows)
	if not total:
	return {"total_responses": 0, "dominant": None, "ratio": 0, "balanced": True}

	distribution = {r["adapter"]: r["cnt"] for r in rows}
	dominant = rows[0]["adapter"]
	ratio = rows[0]["cnt"] / total

	return {
	"total_responses": total,
	"dominant": dominant,
	"ratio": round(ratio, 3),
	"balanced": ratio <= 0.4,
	"distribution": distribution,
	}
	except Exception:
	return {"total_responses": 0, "dominant": None, "ratio": 0, "balanced": True}

	def domain_distribution(self) -> Dict:
	"""Analyze domain distribution."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT domain, COUNT(*) as cnt
	FROM cocoons
	GROUP BY domain
	ORDER BY cnt DESC
	""")
	return {r["domain"]: r["cnt"] for r in cur.fetchall()}
	except Exception:
	return {}

	def complexity_distribution(self) -> Dict:
	"""Analyze query complexity distribution."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT complexity, COUNT(*) as cnt
	FROM cocoons
	GROUP BY complexity
	ORDER BY cnt DESC
	""")
	return {r["complexity"]: r["cnt"] for r in cur.fetchall()}
	except Exception:
	return {}

	def response_length_trend(self, window: int = 20) -> List[int]:
	"""Get response length trend (last N cocoons)."""
	try:
	cur = self._conn.cursor()
	cur.execute("""
	SELECT LENGTH(response) as len
	FROM cocoons
	ORDER BY timestamp DESC
	LIMIT ?
	""", (window,))
	return [r["len"] for r in cur.fetchall()][::-1] # Chronological order
	except Exception:
	return []

	def full_introspection(self) -> Dict:
	"""Full statistical self-analysis (replaces CocoonIntrospectionEngine)."""
	adapter = self.adapter_dominance()
	domains = self.domain_distribution()
	complexities = self.complexity_distribution()
	lengths = self.response_length_trend(20)
	avg_len = sum(lengths) / len(lengths) if lengths else 0

	observations = []
	total = adapter.get("total_responses", 0)
	observations.append(f"I've processed {total} reasoning exchanges.")

	if adapter.get("dominant"):
	ratio = adapter.get("ratio", 0)
	if ratio > 0.4:
	observations.append(
	f"My {adapter['dominant']} adapter handles {ratio:.0%} of queries "
	f"— that's dominant. I should diversify."
	)
	else:
	observations.append(
	f"My adapter usage is balanced (most-used: {adapter['dominant']} at {ratio:.0%})."
	)

	if domains:
	top_domain = max(domains, key=domains.get)
	observations.append(f"Most common domain: {top_domain} ({domains[top_domain]} queries).")

	observations.append(f"Average response length: {avg_len:.0f} characters.")

	return {
	"total_cocoons": total,
	"adapter_dominance": adapter,
	"domain_distribution": domains,
	"complexity_distribution": complexities,
	"avg_response_length": round(avg_len),
	"response_length_trend": lengths,
	"observations": observations,
	}

	# ─────────────────────────────────────────────────────────
	# LEGACY MIGRATION
	# ─────────────────────────────────────────────────────────
	def _migrate_legacy(self):
	"""Migrate legacy JSON cocoons and .cocoon files into SQLite."""
	migrated = 0

	# Migrate JSON reasoning cocoons
	for f in sorted(self.legacy_dir.glob("cocoon_*.json")):
	try:
	with open(f, "r", encoding="utf-8") as fh:
	data = json.load(fh)

	if data.get("type") == "reasoning":
	wrapped = data.get("wrapped", {})
	self.store(
	query=wrapped.get("query", ""),
	response=wrapped.get("response", ""),
	adapter=wrapped.get("adapter", "unknown"),
	domain=wrapped.get("metadata", {}).get("domain", "general"),
	complexity=wrapped.get("metadata", {}).get("complexity", "MEDIUM"),
	importance=7,
	metadata=wrapped.get("metadata"),
	)
	migrated += 1
	elif "summary" in data or "quote" in data:
	# Foundational memory cocoons
	self.store(
	query=data.get("title", f.stem),
	response=data.get("summary", data.get("quote", "")),
	adapter="memory_kernel",
	emotion=data.get("emotion", "neutral"),
	importance=8,
	)
	migrated += 1
	except Exception as e:
	logger.debug(f"Migration skip {f.name}: {e}")

	# Migrate .cocoon files (EMG format)
	for f in sorted(self.legacy_dir.glob("*.cocoon")):
	try:
	with open(f, "r", encoding="utf-8") as fh:
	data = json.load(fh)
	meta = data.get("metadata", {})
	self.store(
	query=meta.get("context", data.get("cocoon_id", f.stem))[:200],
	response=meta.get("context", ""),
	adapter="consciousness_stack",
	emotion=data.get("emotional_classification", "neutral").lower(),
	importance=data.get("importance_rating", 7),
	)
	migrated += 1
	except Exception:
	continue

	if migrated > 0:
	logger.info(f"Migrated {migrated} legacy cocoons to SQLite")
	self._total_stored = self._count()

	# ─────────────────────────────────────────────────────────
	# CACHE
	# ─────────────────────────────────────────────────────────
	def _cache_put(self, key: str, value: Dict):
	"""Add to LRU cache."""
	if key in self._cache:
	self._cache.move_to_end(key)
	self._cache[key] = value
	while len(self._cache) > CACHE_MAX:
	self._cache.popitem(last=False)

	def _cache_get(self, key: str) -> Optional[Dict]:
	"""Get from LRU cache."""
	if key in self._cache:
	self._cache.move_to_end(key)
	self._cache_hits += 1
	return self._cache[key]
	self._cache_misses += 1
	return None

	# ─────────────────────────────────────────────────────────
	# DIAGNOSTICS
	# ─────────────────────────────────────────────────────────
	def get_stats(self) -> Dict:
	"""Memory system stats for health checks."""
	return {
	"total_cocoons": self._total_stored,
	"cache_size": len(self._cache),
	"cache_max": CACHE_MAX,
	"cache_hits": self._cache_hits,
	"cache_misses": self._cache_misses,
	"cache_hit_rate": (
	self._cache_hits / max(1, self._cache_hits + self._cache_misses)
	),
	"db_path": str(self.db_path),
	"db_size_kb": round(self.db_path.stat().st_size / 1024, 1) if self.db_path.exists() else 0,
	}

	def close(self):
	"""Close database connection."""
	if self._conn:
	self._conn.close()