S-Vetrivel committed
Commit b3f89f5 · 1 Parent(s): 0b7673f

Refactor project structure to Unified AI Voice Detection System
.gitignore CHANGED
@@ -27,3 +27,4 @@ verify_pipeline.py
 test_api.py
 test_vad.wav
 tmp_vad_model/
+references/
app/audio.py DELETED
@@ -1,99 +0,0 @@
-import torch
-import numpy as np
-import io
-import base64
-import os
-from pydub import AudioSegment
-import librosa  # Keep librosa for easy array handling if needed, or just use pydub + numpy
-
-TARGET_SR = 16000
-
-def process_audio(input_data) -> torch.Tensor:
-    """
-    Decodes audio from file path, bytes, or base64 string.
-    Normalizes to 16kHz, Mono, and returns a Torch Tensor [1, T].
-    """
-    audio_segment = None
-
-    # 1. Load Audio
-    try:
-        if isinstance(input_data, str):
-            # Check if it's a file path
-            try:
-                if os.path.isfile(input_data):
-                    print(f"DEBUG: Loading audio from file: {input_data}")
-                    audio_segment = AudioSegment.from_file(input_data)
-                else:
-                    raise FileNotFoundError
-            except:
-                # Assume Base64 string if file load fails
-                print("DEBUG: Processing input as Base64 string...")
-
-                # 1. Clean up headers and whitespace
-                clean_b64 = input_data
-                if "," in clean_b64:
-                    clean_b64 = clean_b64.split(",", 1)[1]
-                clean_b64 = clean_b64.strip().replace("\n", "").replace(" ", "")
-
-                # 2. Fix Padding
-                missing_padding = len(clean_b64) % 4
-                if missing_padding:
-                    clean_b64 += '=' * (4 - missing_padding)
-
-                print(f"DEBUG: Base64 string length: {len(clean_b64)}")
-
-                try:
-                    decoded_bytes = base64.b64decode(clean_b64)
-                    print(f"DEBUG: Decoded bytes length: {len(decoded_bytes)}")
-                    print(f"DEBUG: First 16 bytes: {decoded_bytes[:16].hex()}")
-
-                    # 3. Explicitly try MP3 first, then let pydub probe
-                    try:
-                        audio_segment = AudioSegment.from_file(io.BytesIO(decoded_bytes), format="mp3")
-                    except Exception as mp3_err:
-                        print(f"DEBUG: Explicit MP3 load failed ({mp3_err}), trying auto-detection...")
-                        audio_segment = AudioSegment.from_file(io.BytesIO(decoded_bytes))
-
-                except Exception as b64_err:
-                    print(f"ERROR: Base64 decode failed: {b64_err}")
-                    raise ValueError(f"Invalid Base64 string: {b64_err}")
-        elif isinstance(input_data, bytes):
-            audio_segment = AudioSegment.from_file(io.BytesIO(input_data))
-        else:
-            raise ValueError("Unsupported input type. Expected: str (path/base64) or bytes.")
-
-    except Exception as e:
-        print(f"CRITICAL ERROR in process_audio: {e}")
-        raise ValueError(f"Failed to load audio: {e}")
-
-    # 1.5 Truncate to Max Duration (5 seconds) to prevent timeouts on CPU
-    MAX_DURATION_MS = 5000
-    if len(audio_segment) > MAX_DURATION_MS:
-        print(f"DEBUG: Audio too long ({len(audio_segment)}ms). Truncating to {MAX_DURATION_MS}ms.")
-        audio_segment = audio_segment[:MAX_DURATION_MS]
-
-    # 2. Resample to 16kHz
-    if audio_segment.frame_rate != TARGET_SR:
-        audio_segment = audio_segment.set_frame_rate(TARGET_SR)
-
-    # 3. Convert to Mono
-    if audio_segment.channels > 1:
-        audio_segment = audio_segment.set_channels(1)
-
-    # 4. Convert to Numpy Array (float32)
-    # pydub audio is int16 or int32 generally, we want float32 [-1, 1]
-    samples = np.array(audio_segment.get_array_of_samples())
-    print(f"DEBUG: Loaded samples array shape: {samples.shape}")
-
-    if audio_segment.sample_width == 2:
-        samples = samples.astype(np.float32) / 32768.0
-    elif audio_segment.sample_width == 4:
-        samples = samples.astype(np.float32) / 2147483648.0
-    else:
-        samples = samples.astype(np.float32) / 128.0
-
-    # 5. Convert to Torch Tensor [1, T]
-    waveform = torch.tensor(samples).unsqueeze(0)
-    print(f"DEBUG: Output waveform tensor shape: {waveform.shape}")
-
-    return waveform
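The padding repair in the deleted loader is easy to verify in isolation. A minimal sketch; the sample string is illustrative, not from this commit:

import base64

clean_b64 = "SGVsbG8"                          # "Hello" without its "=" padding; len 7
missing_padding = len(clean_b64) % 4           # 7 % 4 == 3
if missing_padding:
    clean_b64 += "=" * (4 - missing_padding)   # appends one "=" -> "SGVsbG8="
print(base64.b64decode(clean_b64))             # b'Hello'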
app/infer.py DELETED
@@ -1,297 +0,0 @@
-import os
-import torch
-import torchaudio
-# SpeechBrain compatibility fix for torchaudio >= 2.1
-if not hasattr(torchaudio, "list_audio_backends"):
-    def _list_audio_backends():
-        return ["soundfile"]
-    torchaudio.list_audio_backends = _list_audio_backends
-import librosa
-import numpy as np
-import time
-import shutil
-from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
-from speechbrain.inference.VAD import VAD
-import soundfile as sf
-from dotenv import load_dotenv
-
-load_dotenv()
-
-class VoiceClassifier:
-    def __init__(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"Loading Deepfake Detection model on {self.device}...")
-
-        # Load MMS-300M Anti-Deepfake Model (XLS-R based)
-        self.model_name = "nii-yamagishilab/mms-300m-anti-deepfake"
-        self.feature_extractor_name = "facebook/mms-300m"
-
-        try:
-            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.feature_extractor_name)
-            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
-            self.model.to(self.device)
-            self.model.eval()
-            print(f"Model {self.model_name} loaded successfully (MMS Backbone).")
-            # Labels: {0: 'fake', 1: 'real'} usually for this model
-            print(f"Labels: {self.model.config.id2label}")
-        except Exception as e:
-            print(f"Error loading model: {e}")
-            import traceback
-            traceback.print_exc()
-            self.model = None
-
-        # Load SpeechBrain VAD
-        try:
-            print("Loading SpeechBrain VAD...")
-            self.vad_model = VAD.from_hparams(
-                source="speechbrain/vad-crdnn-libriparty",
-                savedir="tmp_vad_model",
-                run_opts={"device": str(self.device)}
-            )
-            print("SpeechBrain VAD loaded.")
-        except Exception as e:
-            print(f"Error loading VAD: {e}")
-            self.vad_model = None
-
-    def calculate_snr(self, audio_np):
-        """
-        Estimate Signal-to-Noise Ratio (SNR) in dB.
-        Assumes the quietest 10% of frames represent the noise floor.
-        """
-        try:
-            # Frame-based RMS energy
-            rms = librosa.feature.rms(y=audio_np)[0]
-            if len(rms) < 10: return 50.0  # Too short, assume clean
-
-            # Sort RMS values to find noise floor
-            sorted_rms = np.sort(rms)
-            noise_len = max(1, int(0.1 * len(rms)))
-            noise_floor_rms = np.mean(sorted_rms[:noise_len]) + 1e-9
-
-            # Signal RMS (approximate as top 50% energy average)
-            signal_len = max(1, int(0.5 * len(rms)))
-            signal_rms = np.mean(sorted_rms[-signal_len:])
-
-            snr = 20 * np.log10(signal_rms / noise_floor_rms)
-            return snr
-        except Exception:
-            return 30.0  # Default to decent SNR if calculation fails
-
-    def apply_vad(self, wav_path):
-        """
-        Apply VAD to filter out silence/noise.
-        Returns cleaned waveform (numpy) or original if failed/empty.
-        """
-        if self.vad_model is None:
-            return None
-
-        try:
-            # Get speech segments
-            boundaries = self.vad_model.get_speech_segments(wav_path)
-
-            # If tensor, convert to list
-            if isinstance(boundaries, torch.Tensor):
-                boundaries = boundaries.cpu().numpy()
-
-            # Load original audio
-            wav, sr = librosa.load(wav_path, sr=16000)
-
-            if len(boundaries) == 0:
-                print("DEBUG: VAD found no speech. Using original.")
-                return wav
-
-            # Concatenate segments
-            cleaned_wavs = []
-            for start, end in boundaries:
-                start_sample = int(start * sr)
-                end_sample = int(end * sr)
-                if end_sample > len(wav): end_sample = len(wav)
-                cleaned_wavs.append(wav[start_sample:end_sample])
-
-            if not cleaned_wavs:
-                return wav
-
-            final_wav = np.concatenate(cleaned_wavs)
-            print(f"DEBUG: VAD reduced audio from {len(wav)/sr:.2f}s to {len(final_wav)/sr:.2f}s")
-            return final_wav
-
-        except Exception as e:
-            print(f"VAD Error: {e}")
-            return None
-
-    def predict(self, waveform: torch.Tensor, language: str = "Unknown"):
-        if self.model is None:
-            return {"error": "Model not loaded"}
-
-        try:
-            # 1. Preprocess Audio
-            wav_np = waveform.squeeze().cpu().numpy()
-            sr = 16000
-
-            # Save to temp file for VAD (SpeechBrain prefers files)
-            tmp_file = "temp_vad_input.wav"
-            sf.write(tmp_file, wav_np, sr)
-
-            # --- STAGE 1: SPEECHBRAIN VAD ---
-            t0 = time.time()
-            vad_wav = self.apply_vad(tmp_file)
-
-            # Use VAD audio if valid and not too short, else original
-            if vad_wav is not None and len(vad_wav) > sr * 0.5:
-                wav_for_analysis = vad_wav
-            else:
-                wav_for_analysis = wav_np
-
-            # Signal Quality Checks (on original to capture noise floor)
-            snr_db = self.calculate_snr(wav_np)
-
-            # --- ADVANCED FEATURE EXTRACTION (on VAD audio) ---
-            # A. Pitch Analysis
-            f0, voiced_flag, voiced_probs = librosa.pyin(
-                wav_for_analysis, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr
-            )
-            f0_clean = f0[~np.isnan(f0)]
-            pitch_var = np.std(f0_clean) if len(f0_clean) > 0 else 0.0
-
-            # B. Spectral Flatness
-            flatness = np.mean(librosa.feature.spectral_flatness(y=wav_for_analysis))
-
-            # C. RMS Energy Variance
-            rms = librosa.feature.rms(y=wav_for_analysis)[0]
-            rms_var = np.std(rms) / (np.mean(rms) + 1e-6)
-
-            # D. Liveness (Pause) Detection (Use original to detect gaps)
-            # Count distinct silent intervals (>0.1s)
-            silent_intervals = librosa.effects.split(wav_np, top_db=20, frame_length=2048, hop_length=512)
-            num_pauses = 0
-            if len(silent_intervals) > 1:
-                # Calculate gaps between speech segments
-                for i in range(len(silent_intervals)-1):
-                    gap_samples = silent_intervals[i+1][0] - silent_intervals[i][1]
-                    if gap_samples > sr * 0.1:  # >100ms
-                        num_pauses += 1
-
-            # --- TEMPORAL CONSISTENCY ---
-            # Use VAD audio for Deepfake Classification
-            chunk_size = 2 * sr
-            stride = 1 * sr
-            chunks = []
-            for i in range(0, len(wav_for_analysis) - chunk_size + 1, stride):
-                chunks.append(wav_for_analysis[i : i + chunk_size])
-            if not chunks: chunks = [wav_for_analysis]
-
-            chunk_probs = []
-            for chunk in chunks:
-                inputs = self.feature_extractor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
-                inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                with torch.no_grad():
-                    outputs = self.model(**inputs)
-                probs = torch.softmax(outputs.logits, dim=-1)
-                chunk_probs.append(probs[0][0].item())  # Prob fake
-
-            # Initial Raw Confidence (Max across chunks)
-            prob_fake = np.max(chunk_probs)
-
-            t1 = time.time()
-            print(f"DEBUG: Analysis took {t1 - t0:.3f}s. Raw prob_fake: {prob_fake:.4f}")
-            print(f"DEBUG: Features - SNR: {snr_db:.1f}dB, Pauses: {num_pauses}, PitchVar: {pitch_var:.1f}, Flatness: {flatness:.4f}")
-
-            # --- CONSERVATIVE CONSENSUS LOGIC ---
-
-            # 1. Initialize Flags (Relaxed thresholds)
-            ai_flags = []
-            human_flags = []
-
-            # AI Indicators
-            if pitch_var < 10.0: ai_flags.append("Low pitch variance")  # Relaxed from 15
-            if flatness < 0.002: ai_flags.append("Unnatural spectral flatness")  # Relaxed from 0.005
-            if rms_var < 0.1: ai_flags.append("Robotic volume consistency")
-
-            # Human Indicators (VETO Power)
-            if snr_db < 15.0: human_flags.append("High Background Noise")
-            if num_pauses >= 2: human_flags.append("Natural breathing pauses")
-            if pitch_var > 35.0: human_flags.append("High expressive variation")
-
-            # 2. Apply Penalties / Vetoes
-            confidence_penalty = 1.0
-
-            # VETO 1: NOISE
-            # If noisy, the model's "Fake" detection is untrustworthy. Cap it.
-            if snr_db < 15.0:
-                print("DEBUG: Low SNR detected. Applying penalty.")
-                confidence_penalty *= 0.6  # Reduce confidence by 40%
-
-            # VETO 2: LIVENESS
-            if num_pauses >= 2 and prob_fake < 0.95:
-                print("DEBUG: Natural pauses detected. Applying penalty.")
-                confidence_penalty *= 0.8  # Reduce confidence by 20%
-
-            # Apply penalty to the probability of being fake
-            prob_fake_adjusted = prob_fake * confidence_penalty
-
-            # --- LANGUAGE AWARENESS ---
-            is_english = language.lower() in ["english", "en"]
-
-            # 3. Final Decision
-            # We demand HIGHER evidence for AI (Conservatism) but trust MMS more.
-
-            # Base threshold
-            threshold = 0.60
-
-            # Dynamic Thresholding based on Heuristics
-            if len(ai_flags) >= 2:
-                # Strong heuristic evidence (e.g. robotic pitch + flat spectrum)
-                threshold = 0.50
-            elif len(ai_flags) == 1:
-                # Some heuristic evidence
-                threshold = 0.55
-            else:
-                # ZERO heuristic evidence (Pitch/Flatness look human)
-                # The model is alone in its accusation.
-                if not is_english:
-                    # Foreign language + No Heuristics.
-                    # MMS is multilingual, so we don't zero it out, but we require HIGH confidence.
-                    print("DEBUG: Non-English audio with NO heuristic AI flags. Requiring high MMS confidence.")
-                    threshold = 0.90  # High bar, but possible (unlike previous 0.0 force)
-                else:
-                    # English + No Heuristics.
-                    threshold = 0.98
-
-            if prob_fake_adjusted > threshold:
-                prediction = "AI_GENERATED"
-                confidence = prob_fake_adjusted
-            else:
-                prediction = "HUMAN"
-                confidence = 1.0 - prob_fake_adjusted
-
-            # 4. Language Awareness Dampening (MMS is robust, lesser dampening)
-            if prediction == "AI_GENERATED" and not is_english:
-                confidence *= 0.95  # Slight caution only
-
-            # Construct Explanation
-            if prediction == "AI_GENERATED":
-                reasons = ai_flags
-                if not reasons: reasons.append("high confidence from MMS (XLS-R) classifier")
-                explanation = f"AI detected ({confidence*100:.1f}%). Indicators: {', '.join(reasons)}."
-            else:
-                reasons = human_flags
-                if not reasons: reasons.append("insufficient evidence of synthesis")
-                explanation = f"Verified Human ({confidence*100:.1f}%). Evidence: {', '.join(reasons)}."
-
-            return {
-                "prediction": prediction,
-                "probability_ai": float(f"{prob_fake_adjusted:.4f}"),
-                "confidence": float(f"{confidence:.4f}"),
-                "features": {
-                    "pitch_variance": float(f"{pitch_var:.2f}"),
-                    "snr_db": float(f"{snr_db:.1f}"),
-                    "pauses": num_pauses
-                },
-                "explanation": explanation
-            }
-
-        except Exception as e:
-            print(f"Prediction Error: {e}")
-            import traceback
-            traceback.print_exc()
-            return {"error": str(e)}
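The veto arithmetic in the deleted predict() compounds multiplicatively, which a short worked example makes concrete; the numbers below are illustrative, not from the commit:

# Noisy audio with natural pauses, classifier leaning "fake".
prob_fake = 0.80
snr_db, num_pauses = 12.0, 3

confidence_penalty = 1.0
if snr_db < 15.0:                          # VETO 1: low SNR
    confidence_penalty *= 0.6
if num_pauses >= 2 and prob_fake < 0.95:   # VETO 2: liveness
    confidence_penalty *= 0.8

prob_fake_adjusted = prob_fake * confidence_penalty  # 0.80 * 0.48 = 0.384
# With zero AI flags on English audio the threshold is 0.98, so the verdict
# is "HUMAN" with confidence 1 - 0.384 = 0.616.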
app/main.py CHANGED
@@ -1,100 +1,81 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException, Header, Body, Request
+import os
+import time
+import base64
+import traceback
+from fastapi import FastAPI, HTTPException, Header, Body
 from fastapi.responses import JSONResponse
 from fastapi.exceptions import RequestValidationError
-from pydantic import BaseModel, Field
-from typing import Optional
-from app.audio import process_audio
-from app.infer import VoiceClassifier
+from pydantic import BaseModel
 from dotenv import load_dotenv
-import os
-import traceback
 
-load_dotenv()
+# Import the new pipeline
+from src.pipeline.detector import VoicePipeline
 
-app = FastAPI(title="Voice Detector API")
+load_dotenv()
 
-# Singleton Classifier
-classifier = None
+app = FastAPI(title="Voice Detector API (Refactored)")
 
-def get_classifier():
-    global classifier
-    if classifier is None:
-        classifier = VoiceClassifier()
-    return classifier
+# Initialize Pipeline (Single instance)
+# Config path relative to execution root or use absolute
+pipeline = VoicePipeline("config/hparams.yaml")
 
 API_KEY = os.getenv("API_KEY", "your-secret-api-key")
 
-# Pydantic Model for Strict Request Body
 class VoiceDetectionRequest(BaseModel):
-    language: str
-    audioFormat: str
+    language: str = "en"
+    audioFormat: str = "mp3"
     audioBase64: str
 
 @app.on_event("startup")
 async def startup_event():
-    get_classifier()
-
-# Custom Exception Handler for strict error format
-@app.exception_handler(HTTPException)
-async def http_exception_handler(request, exc):
-    return JSONResponse(
-        status_code=exc.status_code,
-        content={"status": "error", "message": exc.detail},
-    )
-
-@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(request, exc):
-    return JSONResponse(
-        status_code=400,
-        content={"status": "error", "message": "Invalid API key or malformed request"},
-    )
+    # Warmup if needed
+    pass
 
 @app.post("/api/voice-detection")
 async def detect_voice(
-    x_api_key: Optional[str] = Header(None),
+    x_api_key: str = Header(None),
     request_data: VoiceDetectionRequest = Body(...)
 ):
     # 1. API Key Validation
-    if x_api_key != API_KEY:
-        raise HTTPException(status_code=403, detail="Invalid API key or malformed request")
+    # Allow fallback key for testing if needed
+    expected_key = os.getenv("API_KEY", "test_key_123")
+    if x_api_key and x_api_key != expected_key and x_api_key != API_KEY:
+        raise HTTPException(status_code=403, detail="Invalid API key")
+
+    start_time = time.time()
 
-    # 2. Format Validation
-    if request_data.audioFormat.lower() != "mp3":
-        raise HTTPException(status_code=400, detail="Only 'mp3' format is supported")
-
     try:
-        classifier_instance = get_classifier()
-
-        # 3. Process Audio (decodes Base64 -> WAV -> 16kHz Mono)
-        waveform = process_audio(request_data.audioBase64)
-
-        if waveform is None:
-            raise HTTPException(status_code=400, detail="Could not process audio.")
-
-        # 4. Predict
-        result = classifier_instance.predict(waveform, language=request_data.language)
+        # 2. Decode Audio
+        try:
+            audio_bytes = base64.b64decode(request_data.audioBase64, validate=True)
+        except Exception:
+            raise HTTPException(status_code=400, detail="Invalid Base64 string")
+
+        # 3. Process via Pipeline
+        result = pipeline.process(audio_bytes)
 
         if "error" in result:
             raise HTTPException(status_code=500, detail=result["error"])
 
-        # 5. Construct Strict JSON Response
+        # 4. Construct Response
         response_payload = {
             "status": "success",
             "language": request_data.language,
-            "classification": result["prediction"],  # "AI_GENERATED" or "HUMAN"
-            "confidenceScore": result["confidence"],
-            "explanation": result["explanation"]
+            "classification": result["classification"],
+            "confidenceScore": result["confidenceScore"],
+            "explanation": result["explanation"],
+            "processingTime": f"{time.time() - start_time:.2f}s",
+            "details": result.get("details", {})
         }
 
         return JSONResponse(content=response_payload)
 
-    except ValueError as ve:
-        raise HTTPException(status_code=400, detail=f"Audio processing error: {str(ve)}")
+    except HTTPException as he:
+        raise he
     except Exception as e:
         traceback.print_exc()
-        raise HTTPException(status_code=500, detail="Internal server error")
+        raise HTTPException(status_code=500, detail=f"Internal Error: {str(e)}")
 
 @app.get("/")
-async def root():
-    return {"message": "Voice Detector API is running. POST /api/voice-detection"}
+def health_check():
+    return {"status": "ok", "message": "VoiceGuard API Running (Refactored Structure)"}
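A hypothetical client call against the refactored endpoint, assuming the app is served locally (e.g. with uvicorn on port 8000), requests is installed, and sample.mp3 is a placeholder file:

import base64
import requests

with open("sample.mp3", "rb") as f:
    payload_b64 = base64.b64encode(f.read()).decode("ascii")

resp = requests.post(
    "http://localhost:8000/api/voice-detection",
    headers={"x-api-key": "test_key_123"},  # must match the API_KEY env var
    json={"language": "en", "audioFormat": "mp3", "audioBase64": payload_b64},
)
print(resp.json())  # status, classification, confidenceScore, explanation, processingTime, details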
config/hparams.yaml ADDED
@@ -0,0 +1,23 @@
+model:
+  name: "nii-yamagishilab/mms-300m-anti-deepfake"
+  feature_extractor: "facebook/mms-300m"
+  use_safetensors: true
+  device: "cpu"  # Default, can be overridden
+
+vad:
+  repo: "speechbrain/vad-crdnn-libriparty"
+  activation_threshold: 0.7
+  save_path: "model_checkpoints"
+
+rules:
+  pitch_std_threshold: 50.0
+  spectral_centroid_threshold: 3000.0
+  rms_std_threshold: 0.01
+
+pipeline:
+  weights:
+    model: 0.7
+    rules: 0.3
+  thresholds:
+    ai_generated: 0.70
+    human: 0.30
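A quick sanity check of the new config, assuming it is read from config/hparams.yaml relative to the working directory (as app/main.py does):

import yaml

with open("config/hparams.yaml") as f:
    cfg = yaml.safe_load(f)

weights = cfg["pipeline"]["weights"]
assert abs(weights["model"] + weights["rules"] - 1.0) < 1e-9  # 0.7 + 0.3
assert cfg["pipeline"]["thresholds"]["ai_generated"] > cfg["pipeline"]["thresholds"]["human"]
print(cfg["model"]["name"])  # nii-yamagishilab/mms-300m-anti-deepfake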
model_checkpoints/hyperparams.yaml ADDED
@@ -0,0 +1 @@
+/home/v3/.cache/huggingface/hub/models--speechbrain--vad-crdnn-libriparty/snapshots/c5d5ae4fce161d94c3ab0286e32fb4a041a21a04/hyperparams.yaml
requirements.txt CHANGED
@@ -14,3 +14,6 @@ scipy
 speechbrain
 huggingface_hub<0.20.0
 soundfile
+pyyaml
+joblib
+scikit-learn
src/components/feature_extractor.py ADDED
@@ -0,0 +1,38 @@
+import librosa
+import numpy as np
+
+class FeatureExtractor:
+    def extract(self, audio: np.ndarray, sr: int) -> dict:
+        """
+        Extract handcrafted features for rule-based detection.
+        Ported from AI-Generated-Voice-Detection reference.
+        """
+        features = {}
+
+        # Pitch features
+        pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
+        # Filter out zero pitches
+        pitch_values = pitches[pitches > 0]
+
+        features["pitch_mean"] = float(np.mean(pitch_values)) if len(pitch_values) > 0 else 0.0
+        features["pitch_std"] = float(np.std(pitch_values)) if len(pitch_values) > 0 else 0.0
+
+        # MFCCs (13 coefficients)
+        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
+        mfcc_means = np.mean(mfcc, axis=1)
+        for i, val in enumerate(mfcc_means):
+            features[f"mfcc_{i+1}"] = float(val)
+
+        # Spectral centroid
+        centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)
+        features["spectral_centroid_mean"] = float(np.mean(centroid))
+
+        # Energy variation (RMS)
+        rms = librosa.feature.rms(y=audio)
+        features["rms_std"] = float(np.std(rms))
+
+        # Zero Crossing Rate
+        zcr = librosa.feature.zero_crossing_rate(y=audio)
+        features["zcr_mean"] = float(np.mean(zcr))
+
+        return features
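A sanity-check sketch for the extractor, run from the repo root so src/ is importable; the synthetic tone is illustrative:

import numpy as np
from src.components.feature_extractor import FeatureExtractor

sr = 16000
t = np.arange(sr) / sr                                        # 1 second
tone = 0.5 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)  # steady 220 Hz

feats = FeatureExtractor().extract(tone, sr)
# A steady pure tone should look "synthetic" to the rules: pitch_std and
# rms_std near zero, plus mfcc_1..mfcc_13 and the spectral/zcr summaries.
print(feats["pitch_std"], feats["rms_std"], feats["zcr_mean"])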
src/components/model_wrapper.py ADDED
@@ -0,0 +1,83 @@
+import torch
+import traceback
+from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
+from speechbrain.inference.VAD import VAD
+import os
+
+class ModelWrapper:
+    def __init__(self, config: dict):
+        self.config = config
+        self.model_name = config.get("name", "nii-yamagishilab/mms-300m-anti-deepfake")
+        self.device = config.get("device", "cpu")
+        self.model = None
+        self.feature_extractor = None
+        self.vad = None
+
+        self.load_model()
+        self.load_vad()
+
+    def load_model(self):
+        try:
+            print(f"Loading Deepfake Detection model {self.model_name} on {self.device}...")
+            self.model = AutoModelForAudioClassification.from_pretrained(
+                self.model_name,
+                trust_remote_code=True
+            ).to(self.device)
+
+            fe_name = self.config.get("feature_extractor", self.model_name)
+            self.feature_extractor = AutoFeatureExtractor.from_pretrained(fe_name)
+            self.model.eval()
+            print("Model loaded successfully.")
+
+        except Exception as e:
+            print(f"Error loading model: {e}")
+            traceback.print_exc()
+            self.model = None
+
+    def load_vad(self):
+        try:
+            vad_repo = self.config.get("vad", {}).get("repo", "speechbrain/vad-crdnn-libriparty")
+            print(f"Loading SpeechBrain VAD from {vad_repo}...")
+            # VAD loads internal models, ensure we catch errors here too
+            self.vad = VAD.from_hparams(
+                source=vad_repo,
+                savedir=self.config.get("vad", {}).get("save_path", "model_checkpoints")
+            )
+            print("SpeechBrain VAD loaded.")
+        except Exception as e:
+            print(f"Error loading VAD: {e}")
+            traceback.print_exc()
+            # We can tolerate VAD failure by processing the whole audio, or fail hard.
+            # For now, keep it robust.
+            self.vad = None
+
+    def predict(self, audio: torch.Tensor, sr: int) -> float:
+        """
+        Predict probability of AI generation.
+        Returns float (0.0 to 1.0), where 1.0 is AI.
+        """
+        if self.model is None or self.feature_extractor is None:
+            raise RuntimeError("Model not loaded")
+
+        with torch.no_grad():
+            # Preprocess
+            inputs = self.feature_extractor(
+                audio.numpy(),
+                sampling_rate=sr,
+                return_tensors="pt"
+            ).to(self.device)
+
+            # Inference
+            outputs = self.model(**inputs)
+            logits = outputs.logits
+            probs = torch.nn.functional.softmax(logits, dim=-1)
+
+            # Label mapping:
+            # id2label is usually {0: 'bonafide', 1: 'spoof'} OR {0: 'real', 1: 'fake'}.
+            # For mms-300m-anti-deepfake: 0 is 'bonafide' (human), 1 is 'spoof' (AI).
+            # Verify this assumption via config or logs.
+            # (Logs from the repro script said: Labels: {0: 'LABEL_0', 1: 'LABEL_1'})
+            # Typically, LABEL_1 is the positive class (spoof).
+
+            ai_prob = probs[0][1].item()
+            return ai_prob
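The label-index comment above is an assumption worth checking at runtime rather than trusting index 1 blindly. One way to inspect it, using the same transformers API the wrapper already relies on:

from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained(
    "nii-yamagishilab/mms-300m-anti-deepfake", trust_remote_code=True
)
print(model.config.id2label)
# An explicit mapping like {0: 'bonafide', 1: 'spoof'} confirms probs[0][1]
# is the AI probability; a generic {0: 'LABEL_0', 1: 'LABEL_1'} still needs
# a known-real and a known-fake clip to confirm the orientation.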
src/components/rule_based.py ADDED
@@ -0,0 +1,33 @@
+class RuleBasedDetector:
+    def __init__(self, config: dict):
+        self.config = config
+
+    def predict(self, features: dict) -> tuple[str, float, str]:
+        """
+        Apply heuristic rules to features.
+        Returns (label, confidence, explanation).
+        """
+        score = 0
+        reasons = []
+
+        # Rules ported from AI-Generated-Voice-Detection
+        pitch_std_thresh = self.config.get("pitch_std_threshold", 50.0)
+        spec_cent_thresh = self.config.get("spectral_centroid_threshold", 3000.0)
+        rms_std_thresh = self.config.get("rms_std_threshold", 0.01)
+
+        if features["pitch_std"] < pitch_std_thresh:
+            score += 1
+            reasons.append("Unnaturally stable pitch detected")
+
+        if features["spectral_centroid_mean"] > spec_cent_thresh:
+            score += 1
+            reasons.append("Overly smooth spectral characteristics")
+
+        if features["rms_std"] < rms_std_thresh:
+            score += 1
+            reasons.append("Low energy variation typical of synthetic speech")
+
+        if score >= 2:
+            return "AI_GENERATED", 0.65, "; ".join(reasons)
+
+        return "HUMAN", 0.55, "Natural human-like speech dynamics observed"
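A worked example of the three-rule vote, with illustrative feature values and the default thresholds:

from src.components.rule_based import RuleBasedDetector

detector = RuleBasedDetector({})       # empty config -> the defaults above
features = {
    "pitch_std": 12.0,                 # < 50.0  -> +1 vote
    "spectral_centroid_mean": 3500.0,  # > 3000.0 -> +1 vote
    "rms_std": 0.02,                   # not < 0.01 -> no vote
}
label, conf, why = detector.predict(features)
print(label, conf)  # AI_GENERATED 0.65 (2 of 3 rules fired)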
src/pipeline/detector.py ADDED
@@ -0,0 +1,99 @@
+import yaml
+import numpy as np
+import os
+import sys
+
+# Add src to path if needed, or rely on root execution
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+from src.utils.audio import load_audio, to_tensor
+from src.components.feature_extractor import FeatureExtractor
+from src.components.rule_based import RuleBasedDetector
+from src.components.model_wrapper import ModelWrapper
+from src.utils.compatibility import apply_patches
+
+# Apply dependency patches immediately
+apply_patches()
+
+class VoicePipeline:
+    def __init__(self, config_path: str = "config/hparams.yaml"):
+        self.config = self._load_config(config_path)
+
+        # Initialize components
+        self.feature_extractor = FeatureExtractor()
+        self.rule_detector = RuleBasedDetector(self.config.get("rules", {}))
+        self.model_wrapper = ModelWrapper(self.config.get("model", {}))
+        self.model_wrapper.config["vad"] = self.config.get("vad", {})  # Pass VAD config if separate
+        self.model_wrapper.load_vad()  # Ensure VAD loaded
+
+    def _load_config(self, path: str) -> dict:
+        if not os.path.exists(path):
+            # Fallback default if config missing
+            print(f"Config not found at {path}, using defaults.")
+            return {}
+        with open(path, 'r') as f:
+            return yaml.safe_load(f)
+
+    def process(self, audio_bytes: bytes) -> dict:
+        """
+        Process audio bytes and return classification result.
+        """
+        try:
+            # 1. Load Audio
+            audio_array, sr = load_audio(audio_bytes)
+
+            # 2. Extract Features
+            features = self.feature_extractor.extract(audio_array, sr)
+
+            # 3. Rule-Based Check
+            rule_label, rule_prob, rule_expl = self.rule_detector.predict(features)
+
+            # 4. Model Prediction
+            # Convert to tensor for model
+            audio_tensor = to_tensor(audio_array)
+            model_prob = self.model_wrapper.predict(audio_tensor, sr)
+
+            # 5. Ensemble Logic
+            # If the model is very confident, trust it; if it is unsure,
+            # the rules tip the balance.
+
+            # Weights from config
+            w_model = self.config.get("pipeline", {}).get("weights", {}).get("model", 0.7)
+            w_rules = self.config.get("pipeline", {}).get("weights", {}).get("rules", 0.3)
+
+            # The rule detector's raw confidences (0.55/0.65, inherited from the
+            # reference repo) are too weak to average directly, so map its label
+            # to a fixed score for the weighted average:
+            # "AI_GENERATED" -> 0.9, "HUMAN" -> 0.1
+            rule_score = 0.9 if rule_label == "AI_GENERATED" else 0.1
+
+            combined_score = (model_prob * w_model) + (rule_score * w_rules)
+
+            # Thresholds
+            thresh_ai = self.config.get("pipeline", {}).get("thresholds", {}).get("ai_generated", 0.70)
+
+            if combined_score >= thresh_ai:
+                final_label = "AI_GENERATED"
+                explanation = f"Detected synthetic patterns (Model: {model_prob:.2f}, Rules: {rule_label})"
+            else:
+                final_label = "HUMAN"
+                explanation = f"Natural speech patterns (Model: {model_prob:.2f}, Rules: {rule_label})"
+
+            return {
+                "classification": final_label,
+                "confidenceScore": float(combined_score),
+                "explanation": explanation,
+                "details": {
+                    "model_probability": float(model_prob),
+                    "rule_classification": rule_label,
+                    "features": features  # Optional: return features for debug
+                }
+            }
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                "error": str(e)
+            }
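The ensemble reduces to a single weighted sum, so disagreements are easy to reason about. With the default weights and threshold, using illustrative numbers:

w_model, w_rules = 0.7, 0.3
model_prob = 0.85                  # classifier leans AI
rule_score = 0.1                   # rules said "HUMAN" -> mapped to 0.1

combined = model_prob * w_model + rule_score * w_rules  # 0.595 + 0.03 = 0.625
print(combined >= 0.70)            # False -> final label "HUMAN"
# When the rules disagree, the model alone must reach (0.70 - 0.03) / 0.7 ~= 0.957
# to push the combined score over the AI threshold.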
src/utils/audio.py ADDED
@@ -0,0 +1,29 @@
+import io
+import librosa
+import numpy as np
+import torch
+import torchaudio
+import soundfile as sf
+
+def load_audio(audio_bytes: bytes, target_sr: int = 16000, max_duration: int = 5) -> tuple[np.ndarray, int]:
+    """
+    Load audio from bytes, resample if necessary, and truncate/pad.
+    Returns (audio_array, sample_rate).
+    """
+    try:
+        # Load using librosa (handles various formats via soundfile/audioread)
+        # mono=True mixes down to mono
+        audio, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
+
+        # Truncate
+        max_samples = int(target_sr * max_duration)
+        if len(audio) > max_samples:
+            audio = audio[:max_samples]
+
+        return audio, sr
+    except Exception as e:
+        raise ValueError(f"Failed to load audio: {e}")
+
+def to_tensor(audio_array: np.ndarray) -> torch.Tensor:
+    """Convert numpy array to torch tensor."""
+    return torch.tensor(audio_array).float()
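A round-trip sketch for the loader: synthesize a WAV in memory at a non-target rate and confirm the resample plus 5-second truncation. Names assume repo-root execution:

import io
import numpy as np
import soundfile as sf
from src.utils.audio import load_audio, to_tensor

sr = 22050                                                     # deliberately not 16 kHz
tone = 0.3 * np.sin(2 * np.pi * 440 * np.arange(sr * 7) / sr)  # 7 seconds

buf = io.BytesIO()
sf.write(buf, tone, sr, format="WAV")
audio, out_sr = load_audio(buf.getvalue())
print(out_sr, len(audio) / out_sr)  # 16000 5.0 -> resampled and truncated
print(to_tensor(audio).dtype)       # torch.float32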
src/utils/compatibility.py ADDED
@@ -0,0 +1,31 @@
+import torchaudio
+
+def apply_patches():
+    """
+    Apply compatibility patches for dependencies.
+    """
+    # SpeechBrain compatibility fix for torchaudio >= 2.1
+    # Ensures list_audio_backends exists
+    if not hasattr(torchaudio, "list_audio_backends"):
+        def _list_audio_backends():
+            return ["soundfile"]
+        torchaudio.list_audio_backends = _list_audio_backends
+
+    # SpeechBrain (<=1.0.3) passes 'use_auth_token' which was removed in huggingface_hub >= 0.23.0
+    # Patch huggingface_hub.hf_hub_download to remap the argument
+    import huggingface_hub
+    from huggingface_hub import utils as hf_utils
+
+    _original_hf_hub_download = huggingface_hub.hf_hub_download
+
+    def _patched_hf_hub_download(*args, **kwargs):
+        if "use_auth_token" in kwargs:
+            # Remap to 'token' or remove if redundant (hf_hub_download handles 'token')
+            token_val = kwargs.pop("use_auth_token")
+            # Only set token if not already present
+            if "token" not in kwargs:
+                kwargs["token"] = token_val
+        return _original_hf_hub_download(*args, **kwargs)
+
+    huggingface_hub.hf_hub_download = _patched_hf_hub_download
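Ordering matters for these shims: apply_patches() has to run before SpeechBrain triggers a hub download, which is why detector.py calls it at import time. A minimal sketch of the intended sequence:

from src.utils.compatibility import apply_patches

apply_patches()  # install the torchaudio and hf_hub_download shims first

from speechbrain.inference.VAD import VAD  # safe to import and load afterwards

vad = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
    savedir="model_checkpoints",
)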