"""Evaluation metrics suite. All metrics stratified by Fitzpatrick skin type (I-VI) using ITA-based thresholding. Primary metrics: FID, LPIPS, NME, ArcFace identity similarity. Secondary: SSIM (relaxed target >0.80). """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any import numpy as np try: import cv2 except ImportError: cv2 = None # type: ignore[assignment] @dataclass class EvalMetrics: """Computed evaluation metrics for a batch of generated images.""" fid: float = 0.0 lpips: float = 0.0 nme: float = 0.0 # Normalized Mean landmark Error identity_sim: float = 0.0 # ArcFace cosine similarity ssim: float = 0.0 # Per-Fitzpatrick breakdown (all metrics stratified) fid_by_fitzpatrick: dict[str, float] = field(default_factory=dict) nme_by_fitzpatrick: dict[str, float] = field(default_factory=dict) lpips_by_fitzpatrick: dict[str, float] = field(default_factory=dict) ssim_by_fitzpatrick: dict[str, float] = field(default_factory=dict) identity_sim_by_fitzpatrick: dict[str, float] = field(default_factory=dict) count_by_fitzpatrick: dict[str, int] = field(default_factory=dict) # Per-procedure breakdown nme_by_procedure: dict[str, float] = field(default_factory=dict) lpips_by_procedure: dict[str, float] = field(default_factory=dict) ssim_by_procedure: dict[str, float] = field(default_factory=dict) def summary(self) -> str: lines = [ f"FID: {self.fid:.2f}", f"LPIPS: {self.lpips:.4f}", f"NME: {self.nme:.4f}", f"Identity Sim: {self.identity_sim:.4f}", f"SSIM: {self.ssim:.4f}", ] if self.count_by_fitzpatrick: lines.append("\nBy Fitzpatrick Type:") for ftype in sorted(self.count_by_fitzpatrick): n = self.count_by_fitzpatrick[ftype] parts = [f" Type {ftype} (n={n}):"] if ftype in self.lpips_by_fitzpatrick: parts.append(f"LPIPS={self.lpips_by_fitzpatrick[ftype]:.4f}") if ftype in self.ssim_by_fitzpatrick: parts.append(f"SSIM={self.ssim_by_fitzpatrick[ftype]:.4f}") if ftype in self.nme_by_fitzpatrick: parts.append(f"NME={self.nme_by_fitzpatrick[ftype]:.4f}") if ftype in self.identity_sim_by_fitzpatrick: parts.append(f"ID={self.identity_sim_by_fitzpatrick[ftype]:.4f}") lines.append(" ".join(parts)) if self.fid_by_fitzpatrick: lines.append("\nFID by Fitzpatrick:") for k, v in sorted(self.fid_by_fitzpatrick.items()): lines.append(f" Type {k}: {v:.2f}") return "\n".join(lines) def to_dict(self) -> dict: """Convert to flat dictionary for JSON/CSV export.""" d = { "fid": self.fid, "lpips": self.lpips, "nme": self.nme, "identity_sim": self.identity_sim, "ssim": self.ssim, } for ftype in sorted(self.count_by_fitzpatrick): prefix = f"fitz_{ftype}" d[f"{prefix}_count"] = self.count_by_fitzpatrick.get(ftype, 0) d[f"{prefix}_lpips"] = self.lpips_by_fitzpatrick.get(ftype, 0.0) d[f"{prefix}_ssim"] = self.ssim_by_fitzpatrick.get(ftype, 0.0) d[f"{prefix}_nme"] = self.nme_by_fitzpatrick.get(ftype, 0.0) d[f"{prefix}_identity"] = self.identity_sim_by_fitzpatrick.get(ftype, 0.0) for proc in sorted(self.nme_by_procedure): d[f"proc_{proc}_nme"] = self.nme_by_procedure.get(proc, 0.0) d[f"proc_{proc}_lpips"] = self.lpips_by_procedure.get(proc, 0.0) d[f"proc_{proc}_ssim"] = self.ssim_by_procedure.get(proc, 0.0) return d def classify_fitzpatrick_ita(image: np.ndarray) -> str: """Classify Fitzpatrick skin type using Individual Typology Angle (ITA). ITA = arctan((L - 50) / b) * (180 / pi) where L, b are from CIE L*a*b* color space. Thresholds from Chardon et al. (1991): - ITA > 55: Type I (very light) - 41 < ITA <= 55: Type II (light) - 28 < ITA <= 41: Type III (intermediate) - 10 < ITA <= 28: Type IV (tan) - -30 < ITA <= 10: Type V (brown) - ITA <= -30: Type VI (dark) """ if cv2 is None: raise ImportError("opencv-python is required for Fitzpatrick classification") lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB).astype(np.float32) # Sample from face center region (avoid background) h, w = image.shape[:2] center = lab[h // 4 : 3 * h // 4, w // 4 : 3 * w // 4] L_mean = center[:, :, 0].mean() * 100 / 255 # scale to 0-100 b_mean = center[:, :, 2].mean() - 128 # center around 0 if abs(b_mean) < 1e-6: b_mean = 1e-6 ita = np.arctan2(L_mean - 50, b_mean) * (180 / np.pi) if ita > 55: return "I" elif ita > 41: return "II" elif ita > 28: return "III" elif ita > 10: return "IV" elif ita > -30: return "V" else: return "VI" def compute_nme( pred_landmarks: np.ndarray, target_landmarks: np.ndarray, left_eye_idx: int = 33, right_eye_idx: int = 263, ) -> float: """Compute Normalized Mean Error for landmarks. Normalized by inter-ocular distance. Args: pred_landmarks: (N, 2) predicted landmark positions. target_landmarks: (N, 2) ground truth positions. left_eye_idx: MediaPipe index for left eye center. right_eye_idx: MediaPipe index for right eye center. Returns: NME value (lower is better). """ iod = np.linalg.norm( target_landmarks[left_eye_idx] - target_landmarks[right_eye_idx] ) if iod < 1.0: iod = 1.0 distances = np.linalg.norm(pred_landmarks - target_landmarks, axis=1) return float(np.mean(distances) / iod) def compute_ssim( pred: np.ndarray, target: np.ndarray, ) -> float: """Compute Structural Similarity Index (SSIM). Uses scikit-image's windowed SSIM (Wang et al. 2004) for proper per-window computation with 11x11 Gaussian kernel. """ try: from skimage.metrics import structural_similarity # Convert to grayscale if color, or compute per-channel if pred.ndim == 3 and pred.shape[2] == 3: return float(structural_similarity(pred, target, channel_axis=2, data_range=255)) else: return float(structural_similarity(pred, target, data_range=255)) except ImportError: # Fallback: simple global SSIM (not publication-quality) pred_f = pred.astype(np.float64) target_f = target.astype(np.float64) mu_p = np.mean(pred_f) mu_t = np.mean(target_f) sigma_p = np.std(pred_f) sigma_t = np.std(target_f) sigma_pt = np.mean((pred_f - mu_p) * (target_f - mu_t)) C1 = (0.01 * 255) ** 2 C2 = (0.03 * 255) ** 2 ssim_val = ( (2 * mu_p * mu_t + C1) * (2 * sigma_pt + C2) ) / ( (mu_p ** 2 + mu_t ** 2 + C1) * (sigma_p ** 2 + sigma_t ** 2 + C2) ) return float(ssim_val) _LPIPS_FN = None _ARCFACE_APP = None def _get_lpips_fn() -> Any: """Get or create singleton LPIPS model.""" global _LPIPS_FN if _LPIPS_FN is None: import lpips _LPIPS_FN = lpips.LPIPS(net="alex", verbose=False) _LPIPS_FN.eval() return _LPIPS_FN def compute_lpips( pred: np.ndarray, target: np.ndarray, ) -> float: """Compute LPIPS perceptual distance between two images. Returns LPIPS score (lower = more similar). """ try: import lpips # noqa: F401 — availability check; used in _get_lpips_fn import torch except ImportError: return float("nan") _lpips_fn = _get_lpips_fn() def _to_tensor(img: np.ndarray) -> torch.Tensor: t = torch.from_numpy(img.astype(np.float32) / 255.0).permute(2, 0, 1).unsqueeze(0) return t * 2 - 1 # LPIPS expects [-1, 1] with torch.no_grad(): score = _lpips_fn(_to_tensor(pred), _to_tensor(target)) return float(score.item()) def compute_fid( real_dir: str, generated_dir: str, ) -> float: """Compute FID between directories of real and generated images. Uses torch-fidelity for GPU-accelerated computation. Args: real_dir: Path to directory of real images. generated_dir: Path to directory of generated images. Returns: FID score (lower = more similar distributions). """ try: from torch_fidelity import calculate_metrics except ImportError as e: raise ImportError( "torch-fidelity is required for FID. Install with: pip install torch-fidelity" ) from e import torch metrics = calculate_metrics( input1=generated_dir, input2=real_dir, cuda=torch.cuda.is_available(), fid=True, verbose=False, ) return float(metrics["frechet_inception_distance"]) def compute_identity_similarity( pred: np.ndarray, target: np.ndarray, ) -> float: """Compute ArcFace identity cosine similarity between two face images. Returns cosine similarity [0, 1] where 1 = identical identity. Falls back to SSIM-based proxy if InsightFace unavailable. """ try: from insightface.app import FaceAnalysis global _ARCFACE_APP if _ARCFACE_APP is None: _ARCFACE_APP = FaceAnalysis( name="buffalo_l", providers=["CUDAExecutionProvider", "CPUExecutionProvider"], ) _ARCFACE_APP.prepare(ctx_id=-1, det_size=(320, 320)) app = _ARCFACE_APP pred_bgr = pred if pred.shape[2] == 3 else cv2.cvtColor(pred, cv2.COLOR_RGB2BGR) target_bgr = target if target.shape[2] == 3 else cv2.cvtColor(target, cv2.COLOR_RGB2BGR) pred_faces = app.get(pred_bgr) target_faces = app.get(target_bgr) if pred_faces and target_faces: pred_emb = pred_faces[0].embedding target_emb = target_faces[0].embedding sim = np.dot(pred_emb, target_emb) / ( np.linalg.norm(pred_emb) * np.linalg.norm(target_emb) + 1e-8 ) return float(np.clip(sim, 0, 1)) except Exception: pass # Fallback: SSIM-based proxy return compute_ssim(pred, target) # ------------------------------------------------------------------ # Geometric nasal ratios (adapted from Varghaei et al., arXiv:2508.13363) # ------------------------------------------------------------------ # MediaPipe 478-point indices for facial measurements _LEFT_ALAR = 129 # left alar (nose wing) outermost point _RIGHT_ALAR = 358 # right alar _NOSE_TIP = 1 # pronasale _NOSE_BRIDGE_TOP = 168 # nasion (bridge root) _LEFT_INNER_CANTHUS = 133 _RIGHT_INNER_CANTHUS = 362 _LEFT_TRAGION = 234 # left ear (face width proxy) _RIGHT_TRAGION = 454 # right ear _FOREHEAD = 10 # trichion / upper face _CHIN = 152 # menton / lowest chin point def compute_nasal_ratios( landmarks: np.ndarray, ) -> dict[str, float]: """Compute 5 nasal geometric ratios from MediaPipe 478-point landmarks. Ratios from Varghaei et al. (2025), used clinically to assess rhinoplasty outcomes. All ratios are dimensionless. Args: landmarks: (478, 2) or (478, 3) landmark pixel coordinates. Returns: Dict with keys: alar_face_ratio, nose_face_ratio, alar_intercanthal_ratio, tip_deviation, nostril_asymmetry. """ pts = landmarks[:, :2] # use only x,y alar_width = np.linalg.norm(pts[_LEFT_ALAR] - pts[_RIGHT_ALAR]) face_width = np.linalg.norm(pts[_LEFT_TRAGION] - pts[_RIGHT_TRAGION]) nose_length = np.linalg.norm(pts[_NOSE_BRIDGE_TOP] - pts[_NOSE_TIP]) face_height = np.linalg.norm(pts[_FOREHEAD] - pts[_CHIN]) intercanthal = np.linalg.norm( pts[_LEFT_INNER_CANTHUS] - pts[_RIGHT_INNER_CANTHUS] ) # Midline: midpoint between inner canthi midline_x = (pts[_LEFT_INNER_CANTHUS][0] + pts[_RIGHT_INNER_CANTHUS][0]) / 2 tip_deviation = abs(pts[_NOSE_TIP][0] - midline_x) / (face_width + 1e-8) # Nostril asymmetry: difference in left/right alar-to-tip distances left_dist = np.linalg.norm(pts[_LEFT_ALAR] - pts[_NOSE_TIP]) right_dist = np.linalg.norm(pts[_RIGHT_ALAR] - pts[_NOSE_TIP]) nostril_asymmetry = abs(left_dist - right_dist) / (alar_width + 1e-8) return { "alar_face_ratio": float(alar_width / (face_width + 1e-8)), "nose_face_ratio": float(nose_length / (face_height + 1e-8)), "alar_intercanthal_ratio": float(alar_width / (intercanthal + 1e-8)), "tip_deviation": float(tip_deviation), "nostril_asymmetry": float(nostril_asymmetry), } def compute_bilateral_symmetry( landmarks: np.ndarray, ) -> float: """Compute bilateral facial symmetry score from landmarks. Reflects each left-side landmark across the vertical midline and measures average displacement from the corresponding right-side point. Normalized by inter-ocular distance. Based on KDTree approach from Varghaei et al. (2025). Args: landmarks: (478, 2) or (478, 3) landmark pixel coordinates. Returns: Symmetry score in [0, 1] where 1 = perfect symmetry. """ pts = landmarks[:, :2] # Midline from forehead to chin midline_x = (pts[_LEFT_TRAGION][0] + pts[_RIGHT_TRAGION][0]) / 2 iod = np.linalg.norm(pts[33] - pts[263]) # inter-ocular distance if iod < 1.0: iod = 1.0 # MediaPipe left-right correspondence pairs (subset of reliable pairs) # format: (left_idx, right_idx) sym_pairs = [ (33, 263), # outer canthi (133, 362), # inner canthi (70, 300), # eyebrow inner (105, 334), # eyebrow outer (129, 358), # alar (61, 291), # mouth corners (234, 454), # tragion (93, 323), # cheekbone (132, 361), # lower eyelid (159, 386), # upper eyelid (58, 288), # lower lip (172, 397), # chin lateral (136, 365), # nose lateral (48, 278), # nostril ] diffs = [] for left_idx, right_idx in sym_pairs: # Reflect left point across midline reflected_x = 2 * midline_x - pts[left_idx][0] reflected = np.array([reflected_x, pts[left_idx][1]]) diff = np.linalg.norm(reflected - pts[right_idx]) / iod diffs.append(diff) mean_asymmetry = np.mean(diffs) # Convert to 0-1 symmetry score (asymmetry of 0 = score of 1) return float(np.clip(1.0 - mean_asymmetry, 0.0, 1.0)) def evaluate_batch( predictions: list[np.ndarray], targets: list[np.ndarray], pred_landmarks: list[np.ndarray] | None = None, target_landmarks: list[np.ndarray] | None = None, procedures: list[str] | None = None, compute_identity: bool = False, ) -> EvalMetrics: """Evaluate a batch of predicted vs target images. Computes all metrics and stratifies by Fitzpatrick skin type and procedure. Args: predictions: List of predicted BGR images. targets: List of target BGR images. pred_landmarks: Optional list of (N, 2) predicted landmark arrays. target_landmarks: Optional list of (N, 2) target landmark arrays. procedures: Optional list of procedure names for per-procedure breakdown. compute_identity: Whether to compute ArcFace identity similarity (slow). Returns: EvalMetrics with all computed values. """ n = len(predictions) ssim_scores = [] lpips_scores = [] nme_scores = [] identity_scores = [] fitz_groups: dict[str, list[int]] = {} proc_groups: dict[str, list[int]] = {} for i in range(n): ssim_scores.append(compute_ssim(predictions[i], targets[i])) lpips_scores.append(compute_lpips(predictions[i], targets[i])) if pred_landmarks is not None and target_landmarks is not None: nme_scores.append(compute_nme(pred_landmarks[i], target_landmarks[i])) if compute_identity: identity_scores.append(compute_identity_similarity(predictions[i], targets[i])) # Fitzpatrick classification if cv2 is not None: try: fitz = classify_fitzpatrick_ita(targets[i]) fitz_groups.setdefault(fitz, []).append(i) except Exception: pass # Procedure grouping if procedures is not None and i < len(procedures): proc_groups.setdefault(procedures[i], []).append(i) metrics = EvalMetrics( ssim=float(np.nanmean(ssim_scores)) if ssim_scores else 0.0, lpips=float(np.nanmean(lpips_scores)) if lpips_scores else 0.0, nme=float(np.nanmean(nme_scores)) if nme_scores else 0.0, identity_sim=float(np.nanmean(identity_scores)) if identity_scores else 0.0, ) # Full Fitzpatrick stratification for ALL metrics for ftype, indices in fitz_groups.items(): metrics.count_by_fitzpatrick[ftype] = len(indices) group_lpips = [lpips_scores[i] for i in indices] if group_lpips: metrics.lpips_by_fitzpatrick[ftype] = float(np.nanmean(group_lpips)) group_ssim = [ssim_scores[i] for i in indices] if group_ssim: metrics.ssim_by_fitzpatrick[ftype] = float(np.nanmean(group_ssim)) if nme_scores: group_nme = [nme_scores[i] for i in indices if i < len(nme_scores)] if group_nme: metrics.nme_by_fitzpatrick[ftype] = float(np.nanmean(group_nme)) if identity_scores: group_id = [identity_scores[i] for i in indices if i < len(identity_scores)] if group_id: metrics.identity_sim_by_fitzpatrick[ftype] = float(np.nanmean(group_id)) # Per-procedure breakdown for proc, indices in proc_groups.items(): group_lpips = [lpips_scores[i] for i in indices] if group_lpips: metrics.lpips_by_procedure[proc] = float(np.nanmean(group_lpips)) group_ssim = [ssim_scores[i] for i in indices] if group_ssim: metrics.ssim_by_procedure[proc] = float(np.nanmean(group_ssim)) if nme_scores: group_nme = [nme_scores[i] for i in indices if i < len(nme_scores)] if group_nme: metrics.nme_by_procedure[proc] = float(np.nanmean(group_nme)) return metrics