Spaces:
Sleeping
Sleeping
| """ | |
| VocalGuard | |
| File uploads: inverted MelodyMachine labels (real=AI, fake=human) | |
| Mic input: normal labels (fake=AI, real=human) + conservative threshold | |
| """ | |
| import numpy as np | |
| import librosa, soundfile as sf | |
| import time, io, logging, warnings | |
| from typing import Dict, Any, Tuple, List | |
| warnings.filterwarnings("ignore") | |
| logger = logging.getLogger(__name__) | |
# Optional heavy dependencies: torch/transformers may be missing in light
# deployments, so import failure is tolerated here and checked later via
# TORCH_OK instead of crashing at module import time.
try:
    import torch
    from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
    TORCH_OK = True
except ImportError:
    TORCH_OK = False  # _try_load_local() bails out early when False
class VocalGuardDetector:
    """Voice-deepfake detector wrapping a local HF audio classifier.

    File uploads and microphone captures go through different inference
    paths (_infer_file / _infer_mic) because, per the in-code notes, the
    model's label polarity was observed to differ between the two sources.
    """

    # Target sample rate (Hz); all decoded audio is resampled to this.
    SR = 16000
    # Minimum accepted duration (seconds) after silence trimming.
    MIN_DURATION = 0.5

    def __init__(self):
        """Eagerly load the classifier so predict() can report load failures."""
        logger.info("VocalGuard v7.2 initializing...")
        self.local_model = None      # transformers model, or None if load failed
        self.local_extractor = None  # matching feature extractor, or None
        self._try_load_local()
        logger.info("VocalGuard v7.2 ready.")
| def _try_load_local(self): | |
| if not TORCH_OK: | |
| logger.warning("torch not available") | |
| return | |
| try: | |
| model_id = "MelodyMachine/Deepfake-audio-detection-V2" | |
| self.local_extractor = AutoFeatureExtractor.from_pretrained(model_id) | |
| self.local_model = AutoModelForAudioClassification.from_pretrained(model_id) | |
| self.local_model.eval() | |
| logger.info(f"Model loaded: {self.local_model.config.id2label}") | |
| except Exception as e: | |
| logger.error(f"Model load failed: {e}") | |
| # ββ AUDIO LOADING βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _load(self, audio_bytes: bytes) -> np.ndarray: | |
| for fn in [ | |
| lambda b: sf.read(io.BytesIO(b), always_2d=False), | |
| lambda b: librosa.load(io.BytesIO(b), sr=None, mono=True), | |
| lambda b: (np.frombuffer(b, dtype=np.int16).astype(np.float32) / 32768.0, 16000), | |
| ]: | |
| try: | |
| y, sr = fn(audio_bytes) | |
| if hasattr(y, 'ndim') and y.ndim > 1: | |
| y = y.mean(axis=1) | |
| if len(y) > 100: | |
| if sr != self.SR: | |
| y = librosa.resample(y, orig_sr=sr, target_sr=self.SR) | |
| return y.astype(np.float32) | |
| except Exception: | |
| continue | |
| raise ValueError("Cannot decode audio") | |
| # ββ FILE UPLOAD INFERENCE βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _infer_file(self, y: np.ndarray) -> Tuple[float, str]: | |
| """ | |
| MelodyMachine inverted labels for file uploads (confirmed from testing): | |
| 'real' score = AI probability | |
| 'fake' score = human probability | |
| """ | |
| min_len = self.SR * 3 | |
| if len(y) < min_len: | |
| y = np.pad(y, (0, min_len - len(y))) | |
| inputs = self.local_extractor( | |
| y, sampling_rate=self.SR, | |
| return_tensors="pt", padding=True | |
| ) | |
| with torch.no_grad(): | |
| logits = self.local_model(**inputs).logits | |
| probs = torch.softmax(logits, dim=-1)[0].numpy() | |
| id2label = self.local_model.config.id2label | |
| logger.info(f"File probs: { {id2label[i]: round(float(probs[i]), 4) for i in range(len(probs))} }") | |
| # INVERTED labels for this model on file uploads | |
| ai_prob = float(probs[0]) | |
| for idx, lbl in id2label.items(): | |
| if "real" in lbl.lower(): | |
| ai_prob = float(probs[idx]) # real = AI | |
| break | |
| for idx, lbl in id2label.items(): | |
| if "fake" in lbl.lower(): | |
| ai_prob = float(1.0 - probs[idx]) # fake = human β invert | |
| break | |
| logger.info(f"File AI prob: {ai_prob:.4f}") | |
| return float(np.clip(ai_prob, 0.01, 0.99)), "model_file" | |
| # ββ MIC INFERENCE ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _infer_mic(self, y: np.ndarray) -> Tuple[float, str]: | |
| """ | |
| Normal label logic for mic audio: | |
| 'fake' = AI, 'real' = human | |
| Conservative threshold applied to reduce false positives. | |
| """ | |
| from scipy import signal as scipy_signal | |
| # High-pass filter to remove room rumble | |
| sos = scipy_signal.butter(4, 80, 'hp', fs=self.SR, output='sos') | |
| y = scipy_signal.sosfilt(sos, y).astype(np.float32) | |
| # Normalize | |
| peak = np.max(np.abs(y)) | |
| if peak > 0.001: | |
| y /= peak | |
| # Pad to 4 seconds minimum | |
| min_len = self.SR * 4 | |
| if len(y) < min_len: | |
| y = np.pad(y, (0, min_len - len(y))) | |
| inputs = self.local_extractor( | |
| y, sampling_rate=self.SR, | |
| return_tensors="pt", padding=True | |
| ) | |
| with torch.no_grad(): | |
| logits = self.local_model(**inputs).logits | |
| probs = torch.softmax(logits, dim=-1)[0].numpy() | |
| id2label = self.local_model.config.id2label | |
| logger.info(f"Mic probs: { {id2label[i]: round(float(probs[i]), 4) for i in range(len(probs))} }") | |
| # NORMAL labels for mic | |
| ai_prob = float(probs[0]) | |
| for idx, lbl in id2label.items(): | |
| if "fake" in lbl.lower(): | |
| ai_prob = float(probs[idx]) | |
| break | |
| for idx, lbl in id2label.items(): | |
| if "real" in lbl.lower(): | |
| ai_prob = float(1.0 - probs[idx]) | |
| break | |
| # Conservative: compress uncertain results toward human | |
| # Only flag strong AI detections (>0.70) on mic | |
| if ai_prob < 0.70: | |
| ai_prob = ai_prob * 0.45 | |
| logger.info(f"Mic AI prob (after conservative threshold): {ai_prob:.4f}") | |
| return float(np.clip(ai_prob, 0.01, 0.99)), "model_mic" | |
| # ββ MAIN PREDICT ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def predict(self, audio_bytes: bytes, is_mic: bool = False) -> Dict[str, Any]: | |
| t0 = time.time() | |
| y = self._load(audio_bytes) | |
| y, _ = librosa.effects.trim(y, top_db=25) | |
| if len(y) < self.SR * self.MIN_DURATION: | |
| return self._err(t0, "Too short β speak for at least 1 second") | |
| peak = np.max(np.abs(y)) | |
| if peak < 0.003: | |
| return self._err(t0, "Signal too quiet β check microphone") | |
| y /= (peak + 1e-10) | |
| dur = len(y) / self.SR | |
| if self.local_model is None: | |
| return self._err(t0, "Model not loaded β check torch/transformers installation") | |
| logger.info(f"Source: {'mic' if is_mic else 'file'} | duration: {dur:.1f}s") | |
| try: | |
| if is_mic: | |
| ai_prob, method = self._infer_mic(y) | |
| else: | |
| ai_prob, method = self._infer_file(y) | |
| except Exception as e: | |
| logger.error(f"Inference error: {e}") | |
| return self._err(t0, f"Detection failed: {str(e)[:80]}") | |
| ai_prob = float(np.clip(ai_prob, 0.01, 0.99)) | |
| label = "AI Generated" if ai_prob >= 0.5 else "Human Voice" | |
| conf = ai_prob if ai_prob >= 0.5 else (1 - ai_prob) | |
| d = abs(ai_prob - 0.5) | |
| tier = "High" if d > 0.28 else ("Medium" if d > 0.13 else "Low") | |
| logger.info(f"Final β {label} ({conf*100:.1f}%) via {method}") | |
| return { | |
| "label": label, | |
| "confidence": round(conf * 100, 1), | |
| "confidence_tier": tier, | |
| "ai_probability": round(ai_prob, 4), | |
| "human_probability": round(1 - ai_prob, 4), | |
| "duration_seconds": round(dur, 2), | |
| "processing_ms": int((time.time() - t0) * 1000), | |
| "detection_method": method, | |
| "feature_scores": { | |
| "AI Probability": round(ai_prob, 4), | |
| "Human Probability": round(1 - ai_prob, 4), | |
| }, | |
| "key_indicators": self._indicators(ai_prob, method), | |
| } | |
    def predict_fast(self, audio_bytes: bytes, is_mic: bool = False) -> Dict[str, Any]:
        """Compatibility alias: delegates directly to predict() (no fast path)."""
        return self.predict(audio_bytes, is_mic=is_mic)
| def _err(self, t0, msg): | |
| return { | |
| "label": "unknown", "confidence": 0, | |
| "ai_probability": 0.5, "human_probability": 0.5, | |
| "processing_ms": int((time.time() - t0) * 1000), | |
| "warning": msg, "feature_scores": {}, "key_indicators": [] | |
| } | |
| def _indicators(self, ai_prob: float, method: str) -> List[str]: | |
| out = [] | |
| if method == "model_mic": | |
| out.append("ποΈ Live mic analysis β upload file for highest accuracy") | |
| else: | |
| out.append("π¬ ML model analysis on uploaded file") | |
| if ai_prob > 0.75: | |
| out.append("β οΈ Strong AI synthesis markers detected") | |
| elif ai_prob > 0.50: | |
| out.append("β οΈ Possible AI synthesis detected") | |
| elif ai_prob < 0.25: | |
| out.append("β Strong natural human speech markers") | |
| else: | |
| out.append("β Natural human speech markers present") | |
| return out |