# LandmarkDiff — landmarkdiff/evaluation.py (v0.3.2)
"""Evaluation metrics suite.
All metrics stratified by Fitzpatrick skin type (I-VI) using ITA-based thresholding.
Primary metrics: FID, LPIPS, NME, ArcFace identity similarity.
Secondary: SSIM (relaxed target >0.80).
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
import numpy as np
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
@dataclass
class EvalMetrics:
    """Container for batch-level evaluation results.

    Holds aggregate metrics plus per-Fitzpatrick-type and per-procedure
    breakdowns; breakdown dicts are keyed by skin-type / procedure name.
    """

    # Aggregate metrics over the whole batch
    fid: float = 0.0
    lpips: float = 0.0
    nme: float = 0.0  # Normalized Mean landmark Error
    identity_sim: float = 0.0  # ArcFace cosine similarity
    ssim: float = 0.0
    # Per-Fitzpatrick breakdown (all metrics stratified)
    fid_by_fitzpatrick: dict[str, float] = field(default_factory=dict)
    nme_by_fitzpatrick: dict[str, float] = field(default_factory=dict)
    lpips_by_fitzpatrick: dict[str, float] = field(default_factory=dict)
    ssim_by_fitzpatrick: dict[str, float] = field(default_factory=dict)
    identity_sim_by_fitzpatrick: dict[str, float] = field(default_factory=dict)
    count_by_fitzpatrick: dict[str, int] = field(default_factory=dict)
    # Per-procedure breakdown
    nme_by_procedure: dict[str, float] = field(default_factory=dict)
    lpips_by_procedure: dict[str, float] = field(default_factory=dict)
    ssim_by_procedure: dict[str, float] = field(default_factory=dict)

    def summary(self) -> str:
        """Return a human-readable multi-line report of all metrics."""
        out = [
            f"FID: {self.fid:.2f}",
            f"LPIPS: {self.lpips:.4f}",
            f"NME: {self.nme:.4f}",
            f"Identity Sim: {self.identity_sim:.4f}",
            f"SSIM: {self.ssim:.4f}",
        ]
        if self.count_by_fitzpatrick:
            out.append("\nBy Fitzpatrick Type:")
            # Label/table pairs, in display order; missing entries are skipped.
            per_type_tables = (
                ("LPIPS", self.lpips_by_fitzpatrick),
                ("SSIM", self.ssim_by_fitzpatrick),
                ("NME", self.nme_by_fitzpatrick),
                ("ID", self.identity_sim_by_fitzpatrick),
            )
            for ftype in sorted(self.count_by_fitzpatrick):
                row = [f"  Type {ftype} (n={self.count_by_fitzpatrick[ftype]}):"]
                row.extend(
                    f"{label}={table[ftype]:.4f}"
                    for label, table in per_type_tables
                    if ftype in table
                )
                out.append(" ".join(row))
        if self.fid_by_fitzpatrick:
            out.append("\nFID by Fitzpatrick:")
            out.extend(
                f"  Type {k}: {v:.2f}"
                for k, v in sorted(self.fid_by_fitzpatrick.items())
            )
        return "\n".join(out)

    def to_dict(self) -> dict:
        """Flatten all metrics into a single dict for JSON/CSV export."""
        flat: dict = {
            "fid": self.fid,
            "lpips": self.lpips,
            "nme": self.nme,
            "identity_sim": self.identity_sim,
            "ssim": self.ssim,
        }
        # Missing breakdown entries export as 0.0 (0 for counts).
        fitz_tables = (
            ("lpips", self.lpips_by_fitzpatrick),
            ("ssim", self.ssim_by_fitzpatrick),
            ("nme", self.nme_by_fitzpatrick),
            ("identity", self.identity_sim_by_fitzpatrick),
        )
        for ftype in sorted(self.count_by_fitzpatrick):
            flat[f"fitz_{ftype}_count"] = self.count_by_fitzpatrick.get(ftype, 0)
            for suffix, table in fitz_tables:
                flat[f"fitz_{ftype}_{suffix}"] = table.get(ftype, 0.0)
        proc_tables = (
            ("nme", self.nme_by_procedure),
            ("lpips", self.lpips_by_procedure),
            ("ssim", self.ssim_by_procedure),
        )
        for proc in sorted(self.nme_by_procedure):
            for suffix, table in proc_tables:
                flat[f"proc_{proc}_{suffix}"] = table.get(proc, 0.0)
        return flat
def classify_fitzpatrick_ita(image: np.ndarray) -> str:
    """Classify Fitzpatrick skin type (I-VI) via the Individual Typology Angle.

    ITA = arctan((L* - 50) / b*) * 180 / pi, computed on the central crop of
    the image in CIE L*a*b* space. Thresholds follow Chardon et al. (1991):
    ITA > 55 -> I, > 41 -> II, > 28 -> III, > 10 -> IV, > -30 -> V, else VI.

    Args:
        image: BGR uint8 image with the face roughly centered.

    Returns:
        Fitzpatrick type as a Roman numeral string, "I" through "VI".

    Raises:
        ImportError: if OpenCV is not installed.
    """
    if cv2 is None:
        raise ImportError("opencv-python is required for Fitzpatrick classification")
    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB).astype(np.float32)
    rows, cols = image.shape[:2]
    # Central crop approximates the face region and avoids background pixels.
    crop = lab[rows // 4 : 3 * rows // 4, cols // 4 : 3 * cols // 4]
    lightness = crop[:, :, 0].mean() * 100 / 255  # OpenCV stores L* on 0-255
    yellow_blue = crop[:, :, 2].mean() - 128  # b* channel, re-centered at 0
    if abs(yellow_blue) < 1e-6:
        yellow_blue = 1e-6  # keep the arctan2 quadrant well-defined
    ita = np.arctan2(lightness - 50, yellow_blue) * (180 / np.pi)
    # Walk thresholds from lightest to darkest; first match wins.
    for bound, ftype in ((55, "I"), (41, "II"), (28, "III"), (10, "IV"), (-30, "V")):
        if ita > bound:
            return ftype
    return "VI"
def compute_nme(
    pred_landmarks: np.ndarray,
    target_landmarks: np.ndarray,
    left_eye_idx: int = 33,
    right_eye_idx: int = 263,
) -> float:
    """Normalized Mean Error between predicted and ground-truth landmarks.

    Per-point Euclidean errors are averaged and divided by the inter-ocular
    distance (clamped to >= 1 px to avoid division blow-up on tiny faces).

    Args:
        pred_landmarks: (N, 2) predicted landmark positions.
        target_landmarks: (N, 2) ground truth positions.
        left_eye_idx: MediaPipe index for left eye center.
        right_eye_idx: MediaPipe index for right eye center.

    Returns:
        NME value (lower is better).
    """
    eye_span = target_landmarks[left_eye_idx] - target_landmarks[right_eye_idx]
    norm_factor = max(float(np.linalg.norm(eye_span)), 1.0)
    per_point_err = np.linalg.norm(pred_landmarks - target_landmarks, axis=1)
    return float(per_point_err.mean() / norm_factor)
def compute_ssim(
    pred: np.ndarray,
    target: np.ndarray,
) -> float:
    """Structural Similarity Index between two same-shape images.

    Prefers scikit-image's windowed SSIM (Wang et al. 2004). When
    scikit-image is unavailable, falls back to a single global SSIM
    estimate over whole-image statistics (coarser, not publication-quality).
    """
    try:
        from skimage.metrics import structural_similarity
    except ImportError:
        structural_similarity = None

    if structural_similarity is not None:
        kwargs: dict = {"data_range": 255}
        if pred.ndim == 3 and pred.shape[2] == 3:
            kwargs["channel_axis"] = 2  # color: SSIM averaged over channels
        return float(structural_similarity(pred, target, **kwargs))

    # Fallback: one SSIM term computed from global image statistics.
    p = pred.astype(np.float64)
    t = target.astype(np.float64)
    mean_p, mean_t = np.mean(p), np.mean(t)
    std_p, std_t = np.std(p), np.std(t)
    cov_pt = np.mean((p - mean_p) * (t - mean_t))
    c1 = (0.01 * 255) ** 2
    c2 = (0.03 * 255) ** 2
    numerator = (2 * mean_p * mean_t + c1) * (2 * cov_pt + c2)
    denominator = (mean_p ** 2 + mean_t ** 2 + c1) * (std_p ** 2 + std_t ** 2 + c2)
    return float(numerator / denominator)
# Lazily-initialized singletons for heavyweight model objects.
_LPIPS_FN = None
_ARCFACE_APP = None


def _get_lpips_fn() -> Any:
    """Build the LPIPS model (AlexNet backbone) once and cache it."""
    global _LPIPS_FN
    if _LPIPS_FN is not None:
        return _LPIPS_FN
    import lpips

    model = lpips.LPIPS(net="alex", verbose=False)
    model.eval()  # inference mode; weights are frozen
    _LPIPS_FN = model
    return _LPIPS_FN
def compute_lpips(
    pred: np.ndarray,
    target: np.ndarray,
) -> float:
    """LPIPS perceptual distance between two HxWx3 uint8 images.

    Lower means more perceptually similar. Returns NaN when the optional
    `lpips`/`torch` dependencies are not installed.
    """
    try:
        import torch
        import lpips  # noqa: F401 — availability check; used in _get_lpips_fn
    except ImportError:
        return float("nan")

    model = _get_lpips_fn()

    def as_lpips_tensor(img: np.ndarray) -> torch.Tensor:
        # HWC uint8 [0,255] -> NCHW float32 in [-1, 1], as LPIPS expects.
        chw = torch.from_numpy(img.astype(np.float32) / 255.0).permute(2, 0, 1)
        return chw.unsqueeze(0) * 2 - 1

    with torch.no_grad():
        distance = model(as_lpips_tensor(pred), as_lpips_tensor(target))
    return float(distance.item())
def compute_fid(
    real_dir: str,
    generated_dir: str,
) -> float:
    """Frechet Inception Distance between two image directories.

    Delegates to torch-fidelity (GPU-accelerated when CUDA is available).

    Args:
        real_dir: Path to directory of real images.
        generated_dir: Path to directory of generated images.

    Returns:
        FID score (lower = more similar distributions).

    Raises:
        ImportError: if torch-fidelity is not installed.
    """
    try:
        from torch_fidelity import calculate_metrics
    except ImportError as e:
        raise ImportError(
            "torch-fidelity is required for FID. Install with: pip install torch-fidelity"
        ) from e
    import torch

    use_cuda = torch.cuda.is_available()
    results = calculate_metrics(
        input1=generated_dir,
        input2=real_dir,
        cuda=use_cuda,
        fid=True,
        verbose=False,
    )
    return float(results["frechet_inception_distance"])
def compute_identity_similarity(
    pred: np.ndarray,
    target: np.ndarray,
) -> float:
    """Compute ArcFace identity cosine similarity between two face images.
    Returns cosine similarity [0, 1] where 1 = identical identity.
    Falls back to SSIM-based proxy if InsightFace unavailable.
    """
    try:
        from insightface.app import FaceAnalysis
        global _ARCFACE_APP
        # Lazily build a module-level singleton; model loading is expensive.
        if _ARCFACE_APP is None:
            _ARCFACE_APP = FaceAnalysis(
                name="buffalo_l",
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
            # NOTE(review): ctx_id=-1 forces CPU inference even though the CUDA
            # provider is listed above — confirm whether GPU was intended.
            _ARCFACE_APP.prepare(ctx_id=-1, det_size=(320, 320))
        app = _ARCFACE_APP
        # NOTE(review): this check only inspects channel count — it cannot
        # distinguish RGB from BGR, and a non-3-channel input (e.g. RGBA)
        # would hit RGB2BGR, which itself expects 3 channels. Verify the
        # intended input format with the callers.
        pred_bgr = pred if pred.shape[2] == 3 else cv2.cvtColor(pred, cv2.COLOR_RGB2BGR)
        target_bgr = target if target.shape[2] == 3 else cv2.cvtColor(target, cv2.COLOR_RGB2BGR)
        pred_faces = app.get(pred_bgr)
        target_faces = app.get(target_bgr)
        if pred_faces and target_faces:
            # Use the first detected face in each image.
            pred_emb = pred_faces[0].embedding
            target_emb = target_faces[0].embedding
            # Cosine similarity; epsilon guards against zero-norm embeddings.
            sim = np.dot(pred_emb, target_emb) / (
                np.linalg.norm(pred_emb) * np.linalg.norm(target_emb) + 1e-8
            )
            # Cosine similarity may be negative; clip to [0, 1] per the contract.
            return float(np.clip(sim, 0, 1))
    except Exception:
        # Deliberate best-effort: any failure (missing insightface, no face
        # detected, bad input) falls through to the SSIM proxy below.
        pass
    # Fallback: SSIM-based proxy
    return compute_ssim(pred, target)
# ------------------------------------------------------------------
# Geometric nasal ratios (adapted from Varghaei et al., arXiv:2508.13363)
# ------------------------------------------------------------------
# MediaPipe 478-point face-mesh indices used for facial measurements.
_LEFT_ALAR = 129  # left alar (nose wing) outermost point
_RIGHT_ALAR = 358  # right alar
_NOSE_TIP = 1  # pronasale
_NOSE_BRIDGE_TOP = 168  # nasion (bridge root)
_LEFT_INNER_CANTHUS = 133
_RIGHT_INNER_CANTHUS = 362
_LEFT_TRAGION = 234  # left ear (face width proxy)
_RIGHT_TRAGION = 454  # right ear
_FOREHEAD = 10  # trichion / upper face
_CHIN = 152  # menton / lowest chin point


def compute_nasal_ratios(
    landmarks: np.ndarray,
) -> dict[str, float]:
    """Compute 5 nasal geometric ratios from MediaPipe 478-point landmarks.

    Ratios from Varghaei et al. (2025), used clinically to assess
    rhinoplasty outcomes. All ratios are dimensionless; denominators carry
    a 1e-8 epsilon to avoid division by zero on degenerate landmarks.

    Args:
        landmarks: (478, 2) or (478, 3) landmark pixel coordinates.

    Returns:
        Dict with keys: alar_face_ratio, nose_face_ratio,
        alar_intercanthal_ratio, tip_deviation, nostril_asymmetry.
    """
    pts = landmarks[:, :2]  # drop z if present

    def span(a: int, b: int) -> float:
        """Euclidean distance between two landmark indices."""
        return float(np.linalg.norm(pts[a] - pts[b]))

    alar_width = span(_LEFT_ALAR, _RIGHT_ALAR)
    face_width = span(_LEFT_TRAGION, _RIGHT_TRAGION)
    nose_length = span(_NOSE_BRIDGE_TOP, _NOSE_TIP)
    face_height = span(_FOREHEAD, _CHIN)
    intercanthal = span(_LEFT_INNER_CANTHUS, _RIGHT_INNER_CANTHUS)

    # Vertical midline reference: x midpoint between the inner canthi.
    midline_x = (pts[_LEFT_INNER_CANTHUS][0] + pts[_RIGHT_INNER_CANTHUS][0]) / 2
    tip_deviation = abs(pts[_NOSE_TIP][0] - midline_x) / (face_width + 1e-8)

    # Nostril asymmetry: mismatch of left/right alar-to-tip distances.
    left_leg = span(_LEFT_ALAR, _NOSE_TIP)
    right_leg = span(_RIGHT_ALAR, _NOSE_TIP)
    nostril_asymmetry = abs(left_leg - right_leg) / (alar_width + 1e-8)

    return {
        "alar_face_ratio": float(alar_width / (face_width + 1e-8)),
        "nose_face_ratio": float(nose_length / (face_height + 1e-8)),
        "alar_intercanthal_ratio": float(alar_width / (intercanthal + 1e-8)),
        "tip_deviation": float(tip_deviation),
        "nostril_asymmetry": float(nostril_asymmetry),
    }
def compute_bilateral_symmetry(
    landmarks: np.ndarray,
) -> float:
    """Score bilateral facial symmetry from landmarks.

    Each left-side landmark is mirrored across the vertical facial midline
    (the horizontal midpoint between the two tragia) and compared with its
    right-side counterpart. The mean displacement, normalized by the
    inter-ocular distance, is mapped to a score.
    Based on the KDTree approach from Varghaei et al. (2025).

    Args:
        landmarks: (478, 2) or (478, 3) landmark pixel coordinates.

    Returns:
        Symmetry score in [0, 1] where 1 = perfect symmetry.
    """
    pts = landmarks[:, :2]
    # Vertical midline x: midpoint between the two tragia (ear points).
    mid_x = (pts[_LEFT_TRAGION][0] + pts[_RIGHT_TRAGION][0]) / 2
    # Inter-ocular distance (outer canthi 33/263), clamped to >= 1 px.
    iod = max(float(np.linalg.norm(pts[33] - pts[263])), 1.0)
    # Reliable MediaPipe left/right correspondence pairs: (left_idx, right_idx).
    sym_pairs = (
        (33, 263),   # outer canthi
        (133, 362),  # inner canthi
        (70, 300),   # eyebrow inner
        (105, 334),  # eyebrow outer
        (129, 358),  # alar
        (61, 291),   # mouth corners
        (234, 454),  # tragion
        (93, 323),   # cheekbone
        (132, 361),  # lower eyelid
        (159, 386),  # upper eyelid
        (58, 288),   # lower lip
        (172, 397),  # chin lateral
        (136, 365),  # nose lateral
        (48, 278),   # nostril
    )
    displacements = []
    for left_idx, right_idx in sym_pairs:
        # Mirror the left point across the midline; y is unchanged.
        mirrored = np.array([2 * mid_x - pts[left_idx][0], pts[left_idx][1]])
        displacements.append(np.linalg.norm(mirrored - pts[right_idx]) / iod)
    # Asymmetry of 0 maps to a perfect score of 1; clamp into [0, 1].
    return float(np.clip(1.0 - np.mean(displacements), 0.0, 1.0))
def _nanmean_subset(scores: list[float], indices: list[int]) -> float | None:
    """NaN-aware mean of `scores` at the given indices (bounds-checked).

    Returns None when no in-bounds scores exist, so callers can skip
    writing an entry for that group entirely (matches prior behavior).
    """
    group = [scores[i] for i in indices if i < len(scores)]
    if not group:
        return None
    return float(np.nanmean(group))


def evaluate_batch(
    predictions: list[np.ndarray],
    targets: list[np.ndarray],
    pred_landmarks: list[np.ndarray] | None = None,
    target_landmarks: list[np.ndarray] | None = None,
    procedures: list[str] | None = None,
    compute_identity: bool = False,
) -> EvalMetrics:
    """Evaluate a batch of predicted vs target images.

    Computes SSIM and LPIPS per image (plus NME when landmarks are given and
    ArcFace identity similarity when requested), then stratifies every
    per-image metric by Fitzpatrick skin type and by procedure.
    Note: FID is a distribution-level metric and is NOT computed here —
    use compute_fid() on image directories and fill `metrics.fid` separately.

    Args:
        predictions: List of predicted BGR images.
        targets: List of target BGR images.
        pred_landmarks: Optional list of (N, 2) predicted landmark arrays.
        target_landmarks: Optional list of (N, 2) target landmark arrays.
        procedures: Optional list of procedure names for per-procedure breakdown.
        compute_identity: Whether to compute ArcFace identity similarity (slow).

    Returns:
        EvalMetrics with all computed values.
    """
    n = len(predictions)
    ssim_scores: list[float] = []
    lpips_scores: list[float] = []
    nme_scores: list[float] = []
    identity_scores: list[float] = []
    fitz_groups: dict[str, list[int]] = {}
    proc_groups: dict[str, list[int]] = {}
    for i in range(n):
        ssim_scores.append(compute_ssim(predictions[i], targets[i]))
        lpips_scores.append(compute_lpips(predictions[i], targets[i]))
        if pred_landmarks is not None and target_landmarks is not None:
            nme_scores.append(compute_nme(pred_landmarks[i], target_landmarks[i]))
        if compute_identity:
            identity_scores.append(compute_identity_similarity(predictions[i], targets[i]))
        # Stratify by the *target* image's skin type (ground-truth appearance).
        if cv2 is not None:
            try:
                fitz = classify_fitzpatrick_ita(targets[i])
                fitz_groups.setdefault(fitz, []).append(i)
            except Exception:
                pass  # best-effort: skip images that fail classification
        # Procedure grouping (extra predictions beyond `procedures` are skipped)
        if procedures is not None and i < len(procedures):
            proc_groups.setdefault(procedures[i], []).append(i)

    metrics = EvalMetrics(
        ssim=float(np.nanmean(ssim_scores)) if ssim_scores else 0.0,
        lpips=float(np.nanmean(lpips_scores)) if lpips_scores else 0.0,
        nme=float(np.nanmean(nme_scores)) if nme_scores else 0.0,
        identity_sim=float(np.nanmean(identity_scores)) if identity_scores else 0.0,
    )

    # Full Fitzpatrick stratification for ALL per-image metrics.
    for ftype, indices in fitz_groups.items():
        metrics.count_by_fitzpatrick[ftype] = len(indices)
        for scores, table in (
            (lpips_scores, metrics.lpips_by_fitzpatrick),
            (ssim_scores, metrics.ssim_by_fitzpatrick),
            (nme_scores, metrics.nme_by_fitzpatrick),
            (identity_scores, metrics.identity_sim_by_fitzpatrick),
        ):
            mean = _nanmean_subset(scores, indices)
            if mean is not None:
                table[ftype] = mean

    # Per-procedure breakdown (LPIPS/SSIM/NME only — matches EvalMetrics fields).
    for proc, indices in proc_groups.items():
        for scores, table in (
            (lpips_scores, metrics.lpips_by_procedure),
            (ssim_scores, metrics.ssim_by_procedure),
            (nme_scores, metrics.nme_by_procedure),
        ):
            mean = _nanmean_subset(scores, indices)
            if mean is not None:
                table[proc] = mean
    return metrics