| """Stochastic noise models for the biological simulator."""
|
|
|
| from __future__ import annotations
|
|
|
| from typing import Dict, List, Tuple
|
|
|
| import numpy as np
|
|
|
|
|
class NoiseModel:
    """Generates calibrated noise for simulated experimental outputs.

    All randomness is funnelled through a single ``numpy.Generator``
    so that episodes are reproducible given the same seed.

    Every public method returns plain Python scalars / containers
    (``int``, ``float``, ``bool``, ``list``, ``dict``), never NumPy scalar
    types.
    """

    def __init__(self, seed: int = 42):
        """Create the model with a generator seeded by *seed*."""
        self.rng = np.random.default_rng(seed)

    def reseed(self, seed: int) -> None:
        """Replace the internal generator with a fresh one seeded by *seed*."""
        self.rng = np.random.default_rng(seed)

    # ------------------------------------------------------------------
    # Expression-level noise
    # ------------------------------------------------------------------

    def add_expression_noise(
        self,
        true_values: Dict[str, float],
        noise_level: float,
        dropout_rate: float,
    ) -> Dict[str, float]:
        """Return a noisy copy of *true_values* with random dropout.

        For each gene one uniform draw decides dropout; only genes that are
        not dropped consume an additional Gaussian draw.

        Args:
            true_values: Mapping of gene name to its true value.
            noise_level: Multiplier on ``abs(value)`` for the Gaussian
                standard deviation (a constant ``0.1`` is added on top).
            dropout_rate: Numerator of the dropout probability
                ``dropout_rate / (1 + abs(value))``; larger magnitudes are
                therefore dropped less often.

        Returns:
            Mapping with the same keys; dropped genes are ``0.0``.
        """
        noisy: Dict[str, float] = {}
        for gene, value in true_values.items():
            magnitude = abs(value)
            p_drop = dropout_rate / (1.0 + magnitude)
            if self.rng.random() < p_drop:
                noisy[gene] = 0.0
                continue
            # The additive 0.1 floor keeps sigma positive for zero values.
            sigma = noise_level * magnitude + 0.1
            noisy[gene] = float(value + self.rng.normal(0, sigma))
        return noisy

    # ------------------------------------------------------------------
    # Effect sizes and p-values
    # ------------------------------------------------------------------

    @staticmethod
    def _standard_error(sample_size: int, noise_level: float):
        """Return ``noise_level / sqrt(max(sample_size, 1))``."""
        # sample_size is floored at 1; the outer 1e-6 floor is a defensive
        # guard against division by zero.
        return noise_level / max(np.sqrt(max(sample_size, 1)), 1e-6)

    def sample_effect_sizes(
        self,
        true_effects: Dict[str, float],
        sample_size: int,
        noise_level: float,
    ) -> Dict[str, float]:
        """Return *true_effects* perturbed by Gaussian sampling error.

        Args:
            true_effects: Mapping of gene name to true effect.
            sample_size: Sample size; values below 1 are treated as 1.
            noise_level: Numerator of the standard error.

        Returns:
            Mapping of gene name to ``effect + N(0, se)`` with
            ``se = noise_level / sqrt(sample_size)``; one Gaussian draw is
            consumed per gene, in iteration order.
        """
        se = self._standard_error(sample_size, noise_level)
        return {
            gene: float(effect + self.rng.normal(0, se))
            for gene, effect in true_effects.items()
        }

    def sample_p_values(
        self,
        true_effects: Dict[str, float],
        sample_size: int,
        noise_level: float,
    ) -> Dict[str, float]:
        """Simulate approximate p-values from z-statistics.

        The two-sided p-value is ``2 * sf(abs(effect) / se)`` under a
        standard normal, with ``se`` computed as in
        :meth:`sample_effect_sizes`.  This method consumes no random draws:
        the result is a deterministic function of its arguments.

        Args:
            true_effects: Mapping of gene name to true effect.
            sample_size: Sample size; values below 1 are treated as 1.
            noise_level: Numerator of the standard error.

        Returns:
            Mapping of gene name to p-value.
        """
        # Imported lazily so scipy is only required when p-values are used.
        from scipy import stats

        se = self._standard_error(sample_size, noise_level)
        # Floor the denominator so a zero noise level does not divide by 0.
        denom = max(se, 1e-8)
        p_values: Dict[str, float] = {}
        for gene, effect in true_effects.items():
            z = abs(effect) / denom
            p_values[gene] = float(2 * stats.norm.sf(z))
        return p_values

    # ------------------------------------------------------------------
    # False discoveries
    # ------------------------------------------------------------------

    def generate_false_positives(
        self, n_background_genes: int, fdr: float
    ) -> List[str]:
        """Return synthetic false-positive gene names.

        The number of names is one ``Binomial(n_background_genes, fdr)``
        draw; names are ``FP_GENE_0``, ``FP_GENE_1``, ...
        """
        n_fp = int(self.rng.binomial(n_background_genes, fdr))
        return [f"FP_GENE_{i}" for i in range(n_fp)]

    def generate_false_negatives(
        self, true_genes: List[str], fnr: float
    ) -> List[str]:
        """Return the subset of *true_genes* that are missed.

        Each gene is independently selected with probability *fnr* (one
        uniform draw per gene); input order is preserved.
        """
        return [g for g in true_genes if self.rng.random() < fnr]

    # ------------------------------------------------------------------
    # Quality metrics and generic samplers
    # ------------------------------------------------------------------

    def quality_degradation(
        self, base_quality: float, factors: List[float]
    ) -> float:
        """Return *base_quality* scaled by every factor, plus jitter.

        The product ``base_quality * f1 * f2 * ...`` receives Gaussian
        noise with standard deviation ``0.02`` and is clipped to
        ``[0.0, 1.0]``.
        """
        q = base_quality
        for f in factors:
            q *= f
        return float(np.clip(q + self.rng.normal(0, 0.02), 0.0, 1.0))

    def sample_qc_metric(
        self, mean: float, std: float, clip_lo: float = 0.0, clip_hi: float = 1.0
    ) -> float:
        """Return one ``N(mean, std)`` draw clipped to ``[clip_lo, clip_hi]``."""
        return float(np.clip(self.rng.normal(mean, std), clip_lo, clip_hi))

    def sample_count(self, lam: float) -> int:
        """Return one Poisson draw with rate *lam* (negative rates become 0)."""
        return int(self.rng.poisson(max(lam, 0)))

    def coin_flip(self, p: float) -> bool:
        """Return ``True`` when a uniform ``[0, 1)`` draw is below *p*."""
        return bool(self.rng.random() < p)

    def sample_cluster_count(
        self, n_true_populations: int, quality: float
    ) -> int:
        """Over- or under-clustering depending on preprocessing quality.

        Args:
            n_true_populations: True number of populations.
            quality: Preprocessing quality; ``round((1 - quality) * 3)``
                spurious clusters are added (never fewer than 0).

        Returns:
            ``n_true_populations`` plus a uniform integer jitter in
            ``[-2, 2]`` plus the spurious clusters, floored at 1, as a
            built-in ``int``.
        """
        # Generator.integers returns a NumPy integer; cast it so the result
        # is a plain int like every other sampler in this class.
        delta = int(self.rng.integers(-2, 3))
        noise_clusters = max(0, int(round((1.0 - quality) * 3)))
        return int(max(1, n_true_populations + delta + noise_clusters))

    def shuffle_ranking(
        self, items: List[str], noise_level: float
    ) -> List[str]:
        """Permute a ranking with Gaussian noise on ordinals.

        Each position ``0..n-1`` receives ``N(0, noise_level * n)`` noise
        and the items are re-ordered by the perturbed positions.  An empty
        input returns ``[]`` without consuming any random draws.
        """
        n = len(items)
        if n == 0:
            return []
        scores = np.arange(n, dtype=float) + self.rng.normal(
            0, noise_level * n, size=n
        )
        order = np.argsort(scores)
        return [items[int(i)] for i in order]
|
|
|