| """ |
| Token-Level Confidence Scoring Engine |
| |
| Synthesizes four confidence signals to rate individual claims: |
| 1. Semantic Confidence — Confidence markers in text ("I'm confident that...", "arguably...") |
| 2. Attentional Confidence — Semantic overlap with other agents' responses |
| 3. Probabilistic Confidence — Token-level probabilities from LLM logits |
| 4. Integrated Learning Signal — Historical coherence from past similar responses |
| |
| Author: Claude Code |
| """ |
|
|
| import re |
| import time |
| import json |
| from dataclasses import dataclass, asdict |
| from typing import Dict, List, Optional, Tuple, Any |
| from collections import defaultdict |
| import hashlib |
| import logging |
|
|
# NOTE(review): calling basicConfig at import time configures the process-wide
# root logger; that is fine for a standalone script, but if this module is
# imported as a library it can override the host application's logging setup —
# consider leaving configuration to the application. TODO confirm intent.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
| |
| CONFIDENCE_MARKERS = { |
| "high": [ |
| r"\bi['\"]?m confident\b", |
| r"\bdefinitively\b", |
| r"\bclearly\b", |
| r"\bunambiguously\b", |
| r"\bcertainly\b", |
| r"\bwithout doubt\b", |
| r"\bno question\b", |
| r"\bproven\b", |
| r"\bestablished fact\b", |
| ], |
| "medium": [ |
| r"\bi argue\b", |
| r"\b(it appears|it seems)\b", |
| r"\breasonably\b", |
| r"\barguably\b", |
| r"\blikely\b", |
| r"\bprobably\b", |
| r"\bin my view\b", |
| r"\bi think\b", |
| r"\bi believe\b", |
| r"\bfrom my perspective\b", |
| ], |
| "low": [ |
| r"\b(it['\"]?s possible|it could be)\b", |
| r"\bone could say\b", |
| r"\bperhaps\b", |
| r"\bmaybe\b", |
| r"\buncertain\b", |
| r"\bi['\"]?m not sure\b", |
| r"\ballegedly\b", |
| r"\bseemingly\b", |
| r"\bapparently\b", |
| r"\bwhoa\b", |
| ], |
| } |
|
|
| |
| _MARKER_PATTERNS = {} |
| for level, markers in CONFIDENCE_MARKERS.items(): |
| _MARKER_PATTERNS[level] = [re.compile(m, re.IGNORECASE) for m in markers] |
|
|
|
|
@dataclass
class ClaimSegment:
    """A single claim (sentence-level segment) extracted from an agent's response.

    Built by TokenConfidenceEngine._extract_claims: the four component fields
    are means of the per-token signals over the claim's tokens, and
    ``confidence`` is their weighted composite clamped to [0, 1].
    """

    text: str  # claim text, stripped of surrounding whitespace
    start_idx: int  # character offset of the raw sentence segment in the response
    end_idx: int  # character offset of the segment end (start_idx + raw length)
    confidence: float  # weighted composite of the four signals, in [0, 1]
    semantic_conf: float  # mean marker-based (semantic) confidence
    attentional_conf: float  # mean peer-overlap (attentional) confidence
    probabilistic_conf: float  # mean token-probability confidence
    learning_signal: float  # historical-coherence learning signal
    agent_name: str = ""  # agent that produced the claim
    debate_round: int = 0  # not set by _extract_claims; presumably filled by callers — TODO confirm
|
|
|
|
@dataclass
class TokenConfidenceScore:
    """Per-token confidence analysis for a full response.

    Holds the four per-token signal dicts, their weighted composite, and the
    sentence-level claims derived from them. ``timestamp`` defaults to the
    creation time when not supplied.
    """

    agent_name: str
    response_text: str
    token_scores: List[float]
    claims: List[ClaimSegment]
    semantic_confidence_dict: Dict[int, float]
    attentional_confidence_dict: Dict[int, float]
    probabilistic_confidence_dict: Dict[int, float]
    learning_signal_dict: Dict[int, float]
    composite_scores: Dict[int, float]
    timestamp: float = 0.0

    def __post_init__(self):
        # Stamp with the current wall-clock time when the caller left the
        # sentinel default in place.
        if self.timestamp == 0.0:
            self.timestamp = time.time()

    def to_dict(self) -> Dict:
        """Serialize a compact summary for storage.

        The response text is truncated to its first 500 characters; claims
        are reduced to their text plus the per-signal confidence values.
        """
        claim_fields = (
            "text",
            "confidence",
            "semantic_conf",
            "attentional_conf",
            "probabilistic_conf",
            "learning_signal",
        )
        scores = self.token_scores
        mean_confidence = sum(scores) / len(scores) if scores else 0.0
        return {
            "agent_name": self.agent_name,
            "response_text": self.response_text[:500],
            "mean_token_confidence": mean_confidence,
            "claims_count": len(self.claims),
            "claims": [
                {field: getattr(claim, field) for field in claim_fields}
                for claim in self.claims
            ],
        }
|
|
|
|
class TokenConfidenceEngine:
    """Four-signal token confidence scorer."""

    def __init__(
        self,
        embedding_model: Optional[Any] = None,
        living_memory: Optional[Any] = None,
        alpha: float = 0.25,
        beta: float = 0.25,
        gamma: float = 0.25,
        delta: float = 0.25,
    ):
        """
        Initialize the token confidence engine.

        Args:
            embedding_model: Model used to generate embeddings (optional;
                falls back to sklearn when None).
            living_memory: LivingMemoryKernel instance used for the
                historical-coherence lookup.
            alpha: Weight of the semantic confidence signal.
            beta: Weight of the attentional confidence signal.
            gamma: Weight of the probabilistic confidence signal.
            delta: Weight of the learning signal.
        """
        self.embedding_model = embedding_model
        self.living_memory = living_memory

        # Blend weights for the four signals (semantic, attentional,
        # probabilistic, learning), in that order.
        self.alpha, self.beta, self.gamma, self.delta = alpha, beta, gamma, delta

        # Lazily-initialized embedder and its cache (reserved for
        # embedding-based scoring).
        self._embedder = None
        self._embedder_cache: Dict[str, Any] = {}
|
|
| def score_tokens( |
| self, |
| agent_response: str, |
| agent_name: str, |
| peer_responses: Optional[Dict[str, str]] = None, |
| logits: Optional[List[float]] = None, |
| ) -> TokenConfidenceScore: |
| """ |
| Score all tokens/claims in an agent's response using 4 signals. |
| |
| Args: |
| agent_response: The full response text from the agent |
| agent_name: Name of the agent (for memory lookup) |
| peer_responses: Dict {peer_agent_name: response_text} for attentional scoring |
| logits: Optional list of per-token probabilities from generation |
| |
| Returns: |
| TokenConfidenceScore with all components |
| """ |
| if peer_responses is None: |
| peer_responses = {} |
|
|
| |
| semantic_conf_dict = self._parse_semantic_markers(agent_response) |
|
|
| |
| attentional_conf_dict = self._compute_attentional_confidence( |
| agent_response, peer_responses |
| ) |
|
|
| |
| probabilistic_conf_dict = self._extract_probabilistic_confidence( |
| agent_response, logits |
| ) |
|
|
| |
| learning_signal_dict = self._compute_learning_signal( |
| agent_response, agent_name |
| ) |
|
|
| |
| claims = self._extract_claims( |
| agent_response, |
| semantic_conf_dict, |
| attentional_conf_dict, |
| probabilistic_conf_dict, |
| learning_signal_dict, |
| agent_name, |
| ) |
|
|
| |
| token_scores = [] |
| composite_scores = {} |
|
|
| for i, token_text in enumerate(agent_response.split()): |
| semantic = semantic_conf_dict.get(i, 0.5) |
| attentional = attentional_conf_dict.get(i, 0.5) |
| probabilistic = probabilistic_conf_dict.get(i, 0.5) |
| learning = learning_signal_dict.get(i, 0.5) |
|
|
| |
| composite = ( |
| self.alpha * semantic |
| + self.beta * attentional |
| + self.gamma * probabilistic |
| + self.delta * learning |
| ) |
| composite = max(0.0, min(1.0, composite)) |
|
|
| token_scores.append(composite) |
| composite_scores[i] = composite |
|
|
| return TokenConfidenceScore( |
| agent_name=agent_name, |
| response_text=agent_response, |
| token_scores=token_scores, |
| claims=claims, |
| semantic_confidence_dict=semantic_conf_dict, |
| attentional_confidence_dict=attentional_conf_dict, |
| probabilistic_confidence_dict=probabilistic_conf_dict, |
| learning_signal_dict=learning_signal_dict, |
| composite_scores=composite_scores, |
| ) |
|
|
| def _parse_semantic_markers(self, response: str) -> Dict[int, float]: |
| """ |
| Parse confidence markers from text. |
| |
| Returns: |
| Dict mapping token_idx to confidence [0, 1] |
| """ |
| conf_dict = {} |
| tokens = response.split() |
|
|
| |
| for level, confidence_level in [("high", 0.9), ("medium", 0.6), ("low", 0.3)]: |
| for pattern in _MARKER_PATTERNS[level]: |
| for match in pattern.finditer(response): |
| |
| char_pos = match.start() |
| char_count = 0 |
| for token_idx, token in enumerate(tokens): |
| if char_count <= char_pos < char_count + len(token): |
| |
| for nearby_idx in range( |
| max(0, token_idx - 1), min(len(tokens), token_idx + 4) |
| ): |
| if nearby_idx not in conf_dict: |
| conf_dict[nearby_idx] = confidence_level |
| else: |
| |
| conf_dict[nearby_idx] = max( |
| conf_dict[nearby_idx], confidence_level |
| ) |
| break |
| char_count += len(token) + 1 |
|
|
| |
| for i in range(len(tokens)): |
| if i not in conf_dict: |
| conf_dict[i] = 0.5 |
|
|
| return conf_dict |
|
|
| def _compute_attentional_confidence( |
| self, agent_response: str, peer_responses: Dict[str, str] |
| ) -> Dict[int, float]: |
| """ |
| Compute attentional confidence via semantic overlap with peers. |
| |
| High overlap = higher confidence (claim addresses peer perspectives) |
| |
| Returns: |
| Dict mapping token_idx to confidence [0.3, 1.0] |
| """ |
| conf_dict = {} |
| tokens = agent_response.split() |
|
|
| if not peer_responses: |
| |
| for i in range(len(tokens)): |
| conf_dict[i] = 0.5 |
| return conf_dict |
|
|
| |
| token_overlaps = defaultdict(list) |
|
|
| for peer_name, peer_response in peer_responses.items(): |
| peer_tokens_set = set(peer_response.lower().split()) |
|
|
| for token_idx, token in enumerate(tokens): |
| |
| if token.lower() in peer_tokens_set: |
| token_overlaps[token_idx].append(1.0) |
| elif any( |
| token.lower().startswith(p[:3]) or p.startswith(token.lower()[:3]) |
| for p in peer_tokens_set |
| ): |
| |
| token_overlaps[token_idx].append(0.6) |
|
|
| |
| for i in range(len(tokens)): |
| if token_overlaps[i]: |
| overlap_score = sum(token_overlaps[i]) / len(token_overlaps[i]) |
| else: |
| overlap_score = 0.0 |
|
|
| |
| attentional_conf = 0.3 + 0.7 * overlap_score |
| conf_dict[i] = attentional_conf |
|
|
| return conf_dict |
|
|
| def _extract_probabilistic_confidence( |
| self, response: str, logits: Optional[List[float]] = None |
| ) -> Dict[int, float]: |
| """ |
| Extract per-token probabilities from logits. |
| |
| If logits not provided, use fallback heuristic (all 0.5). |
| |
| Returns: |
| Dict mapping token_idx to probability [0, 1] |
| """ |
| conf_dict = {} |
| tokens = response.split() |
|
|
| if logits and len(logits) == len(tokens): |
| |
| for i, prob in enumerate(logits): |
| conf_dict[i] = max(0.0, min(1.0, prob)) |
| else: |
| |
| common_words = { |
| "the", |
| "a", |
| "is", |
| "and", |
| "or", |
| "of", |
| "to", |
| "in", |
| "that", |
| "it", |
| } |
| for i, token in enumerate(tokens): |
| if token.lower() in common_words: |
| conf_dict[i] = 0.9 |
| elif len(token) > 3: |
| conf_dict[i] = 0.6 |
| else: |
| conf_dict[i] = 0.5 |
|
|
| return conf_dict |
|
|
| def _compute_learning_signal( |
| self, response: str, agent_name: str |
| ) -> Dict[int, float]: |
| """ |
| Compute learning signal from historical coherence (Phase 2 enhancement). |
| |
| Query memory for similar past responses and boost confidence if |
| they led to high coherence. Recent memories are weighted higher. |
| |
| Returns: |
| Dict mapping token_idx to learning signal [0.5, 1.0] |
| |
| Phase 2: Now includes recency weighting with ~7 day half-life |
| """ |
| import math |
|
|
| conf_dict = {} |
| tokens = response.split() |
|
|
| |
| if not self.living_memory: |
| for i in range(len(tokens)): |
| conf_dict[i] = 0.5 |
| return conf_dict |
|
|
| |
| try: |
| similar_cocoons = self.living_memory.recall_by_adapter( |
| agent_name, limit=10 |
| ) |
| if not similar_cocoons: |
| avg_coherence = 0.5 |
| else: |
| |
| |
| recency_weights = [] |
| weighted_coherences = [] |
|
|
| for cocoon in similar_cocoons: |
| age_hours = cocoon.age_hours() |
| |
| recency_weight = math.exp(-age_hours / 168.0) |
| recency_weights.append(recency_weight) |
| weighted_coherences.append(cocoon.coherence * recency_weight) |
|
|
| |
| total_weight = sum(recency_weights) |
| if total_weight > 0: |
| avg_coherence = sum(weighted_coherences) / total_weight |
| else: |
| avg_coherence = 0.5 |
|
|
| except Exception as e: |
| logger.warning(f"Error retrieving memory for {agent_name}: {e}") |
| avg_coherence = 0.5 |
|
|
| |
| |
| learning_signal = 0.5 + 0.5 * avg_coherence |
|
|
| for i in range(len(tokens)): |
| conf_dict[i] = learning_signal |
|
|
| return conf_dict |
|
|
| def _extract_claims( |
| self, |
| response: str, |
| semantic_conf_dict: Dict[int, float], |
| attentional_conf_dict: Dict[int, float], |
| probabilistic_conf_dict: Dict[int, float], |
| learning_signal_dict: Dict[int, float], |
| agent_name: str, |
| ) -> List[ClaimSegment]: |
| """ |
| Extract individual claims (sentences/clauses) from response. |
| |
| Returns: |
| List of ClaimSegment with aggregate confidence from component signals |
| """ |
| claims = [] |
|
|
| |
| sentence_pattern = re.compile(r"[.!?]+") |
| sentences = sentence_pattern.split(response) |
|
|
| token_idx = 0 |
| start_char_idx = 0 |
|
|
| for sentence in sentences: |
| if not sentence.strip(): |
| continue |
|
|
| sentence_tokens = sentence.split() |
| sentence_token_indices = list(range(token_idx, token_idx + len(sentence_tokens))) |
| token_idx += len(sentence_tokens) |
|
|
| |
| if sentence_token_indices: |
| semantic = sum( |
| semantic_conf_dict.get(i, 0.5) for i in sentence_token_indices |
| ) / len(sentence_token_indices) |
| attentional = sum( |
| attentional_conf_dict.get(i, 0.5) for i in sentence_token_indices |
| ) / len(sentence_token_indices) |
| probabilistic = sum( |
| probabilistic_conf_dict.get(i, 0.5) for i in sentence_token_indices |
| ) / len(sentence_token_indices) |
| learning = sum( |
| learning_signal_dict.get(i, 0.5) for i in sentence_token_indices |
| ) / len(sentence_token_indices) |
|
|
| composite_confidence = ( |
| self.alpha * semantic |
| + self.beta * attentional |
| + self.gamma * probabilistic |
| + self.delta * learning |
| ) |
| composite_confidence = max(0.0, min(1.0, composite_confidence)) |
|
|
| claim = ClaimSegment( |
| text=sentence.strip(), |
| start_idx=start_char_idx, |
| end_idx=start_char_idx + len(sentence), |
| confidence=composite_confidence, |
| semantic_conf=semantic, |
| attentional_conf=attentional, |
| probabilistic_conf=probabilistic, |
| learning_signal=learning, |
| agent_name=agent_name, |
| ) |
| claims.append(claim) |
|
|
| start_char_idx += len(sentence) + 1 |
|
|
| return claims |
|
|