keet-streaming / src /lib /vad /EnergyVAD.ts
ysdede's picture
feat(space): migrate Hugging Face Space to keet SolidJS app
b8cc2bf
import { EnergyVADConfig, VADResult } from './types';
/**
 * EnergyVAD implements SNR-based Voice Activity Detection with adaptive noise floor tracking.
 * Ported from legacy UI project's AudioSegmentProcessor for robust speech detection.
 *
 * Features:
 * - Adaptive noise floor with fast/slow adaptation rates
 * - SNR-based speech detection (dB scale)
 * - Duration-based hysteresis to filter transients
 *
 * Each call to `process()` classifies one chunk as speech or silence and then debounces
 * that decision: the reported state only flips to speech after `minSpeechDuration` ms of
 * uninterrupted speech chunks, and back to silence after `minSilenceDuration` ms of
 * uninterrupted silence chunks.
 */
export class EnergyVAD {
  /** Noise floor estimate used for a new instance and restored by `reset()`. */
  private static readonly INITIAL_NOISE_FLOOR = 0.005; // Lower for better sensitivity

  /** Values used for every option the caller does not override. */
  private static readonly DEFAULT_CONFIG: EnergyVADConfig = {
    energyThreshold: 0.02, // Fallback energy threshold
    minSpeechDuration: 100,
    minSilenceDuration: 300,
    sampleRate: 16000,
  };

  private config: EnergyVADConfig;
  private isSpeechActive: boolean = false;

  // Hysteresis counters. Despite the "frames" naming they hold a number of samples:
  // each chunk adds `chunk.length` to them.
  private speechConfirmationCounter: number = 0;
  private silenceConfirmationCounter: number = 0;

  // Transition thresholds, in the same unit as the counters (samples).
  private minSpeechFrames: number;
  private minSilenceFrames: number;

  // Adaptive noise floor tracking (from legacy UI project)
  private noiseFloor: number = EnergyVAD.INITIAL_NOISE_FLOOR;
  private snr: number = 0;
  private silenceDuration: number = 0; // Seconds of uninterrupted silence; selects the adaptation rate

  // Adaptation rates (from legacy UI project/audioParams.js)
  private readonly noiseFloorAdaptationRate = 0.05; // Standard EMA rate
  private readonly fastAdaptationRate = 0.15; // Fast rate for initial calibration
  private readonly minBackgroundDuration = 1.0; // Seconds before switching to slow adaptation
  private readonly snrThreshold = 3.0; // SNR threshold in dB for speech detection

  /**
   * @param config - Optional overrides for the default configuration. Properties that are
   *   absent or explicitly `undefined` keep their default value.
   */
  constructor(config: Partial<EnergyVADConfig> = {}) {
    this.config = EnergyVAD.applyOverrides(EnergyVAD.DEFAULT_CONFIG, config);
    this.minSpeechFrames = this.msToSamples(this.config.minSpeechDuration);
    this.minSilenceFrames = this.msToSamples(this.config.minSilenceDuration);
  }

  /**
   * Process an audio chunk and return the VAD state.
   * Uses SNR-based detection with adaptive noise floor (from legacy UI project).
   *
   * A chunk that carries no usable signal (zero samples, or samples whose RMS is NaN)
   * is ignored: the detector state is left untouched and the current state is reported
   * with an energy of 0. Without this guard a single such chunk would turn the noise
   * floor into NaN for every subsequent call.
   *
   * @param chunk - Float32Array of mono PCM samples
   * @returns The debounced speech state (`isSpeech`), the chunk's RMS `energy`, a
   *   wall-clock `timestamp`, one-shot `speechStart` / `speechEnd` transition flags, and
   *   the current `noiseFloor` and `snr` estimates.
   */
  process(chunk: Float32Array): VADResult {
    const timestamp = Date.now();
    const chunkLength = chunk.length;

    // 1. Calculate RMS energy
    let sumSquares = 0;
    for (let i = 0; i < chunkLength; i++) {
      sumSquares += chunk[i] * chunk[i];
    }
    const energy = chunkLength > 0 ? Math.sqrt(sumSquares / chunkLength) : 0;

    if (chunkLength === 0 || Number.isNaN(energy)) {
      return this.buildResult(0, timestamp, false, false);
    }

    // 2. Calculate SNR in dB against the noise floor as it was BEFORE this chunk
    // NOTE(review): `energy` is an RMS amplitude, for which the conventional dB ratio is
    // 20 * log10; the 10 * log10 form is kept because `snrThreshold` is presumably tuned
    // against it - confirm before changing either.
    const safeNoiseFloor = Math.max(0.0001, this.noiseFloor);
    this.snr = 10 * Math.log10(energy / safeNoiseFloor);

    // 3. Speech if either the SNR threshold (primary) or the absolute energy threshold
    //    (fallback) is exceeded
    const isSpeech = this.snr > this.snrThreshold || energy > this.config.energyThreshold;

    // 4. Adapt the noise floor (only moves during silence)
    this.updateNoiseFloor(energy, chunkLength / this.config.sampleRate, isSpeech);

    // 5. Hysteresis: a state change needs enough uninterrupted samples of the new class
    let speechStart = false;
    let speechEnd = false;
    if (isSpeech) {
      this.silenceConfirmationCounter = 0;
      if (!this.isSpeechActive) {
        this.speechConfirmationCounter += chunkLength;
        if (this.speechConfirmationCounter >= this.minSpeechFrames) {
          this.isSpeechActive = true;
          speechStart = true;
        }
      }
    } else {
      this.speechConfirmationCounter = 0;
      if (this.isSpeechActive) {
        this.silenceConfirmationCounter += chunkLength;
        if (this.silenceConfirmationCounter >= this.minSilenceFrames) {
          this.isSpeechActive = false;
          speechEnd = true;
          this.silenceConfirmationCounter = 0;
        }
      }
    }

    return this.buildResult(energy, timestamp, speechStart, speechEnd);
  }

  /**
   * Reset the internal state machine, including the noise floor estimate.
   * The configuration is kept.
   */
  reset(): void {
    this.isSpeechActive = false;
    this.speechConfirmationCounter = 0;
    this.silenceConfirmationCounter = 0;
    this.noiseFloor = EnergyVAD.INITIAL_NOISE_FLOOR;
    this.snr = 0;
    this.silenceDuration = 0;
  }

  /**
   * Update configuration at runtime and recompute the transition thresholds.
   *
   * @param config - Properties to change. Properties that are absent or explicitly
   *   `undefined` keep their current value.
   */
  updateConfig(config: Partial<EnergyVADConfig>): void {
    this.config = EnergyVAD.applyOverrides(this.config, config);
    this.minSpeechFrames = this.msToSamples(this.config.minSpeechDuration);
    this.minSilenceFrames = this.msToSamples(this.config.minSilenceDuration);
  }

  /**
   * Get the current configuration.
   *
   * @returns A shallow copy, so mutating it does not affect the detector.
   */
  getConfig(): EnergyVADConfig {
    return { ...this.config };
  }

  /**
   * Copy `base` and apply every override whose value is not `undefined`.
   * A plain object spread would let `{ sampleRate: undefined }` erase a valid value and
   * turn the derived thresholds into NaN.
   */
  private static applyOverrides(
    base: EnergyVADConfig,
    overrides: Partial<EnergyVADConfig>,
  ): EnergyVADConfig {
    const merged: EnergyVADConfig = { ...base };
    for (const key of Object.keys(overrides) as Array<keyof EnergyVADConfig>) {
      const value = overrides[key];
      if (value !== undefined) {
        Object.assign(merged, { [key]: value });
      }
    }
    return merged;
  }

  /** Convert a duration in milliseconds to a sample count at the configured sample rate, rounded up. */
  private msToSamples(durationMs: number): number {
    return Math.ceil((durationMs / 1000) * this.config.sampleRate);
  }

  /**
   * Move the noise floor toward `energy` during silence; during speech only the silence
   * timer is cleared.
   *
   * The smoothing factor starts near `fastAdaptationRate` and blends linearly into
   * `noiseFloorAdaptationRate` as uninterrupted silence approaches `minBackgroundDuration`.
   */
  private updateNoiseFloor(energy: number, chunkDuration: number, isSpeech: boolean): void {
    if (isSpeech) {
      // Reset silence duration when speech detected
      this.silenceDuration = 0;
      return;
    }

    this.silenceDuration += chunkDuration;

    let adaptationRate = this.noiseFloorAdaptationRate;
    if (this.silenceDuration < this.minBackgroundDuration) {
      const blendFactor = Math.min(1, this.silenceDuration / this.minBackgroundDuration);
      adaptationRate =
        this.fastAdaptationRate * (1 - blendFactor) + this.noiseFloorAdaptationRate * blendFactor;
    }

    // Exponential moving average, clamped so the floor never reaches zero
    const updated = this.noiseFloor * (1 - adaptationRate) + energy * adaptationRate;
    this.noiseFloor = Math.max(0.00001, updated);
  }

  /** Assemble the result object from the given per-chunk values and the current detector state. */
  private buildResult(
    energy: number,
    timestamp: number,
    speechStart: boolean,
    speechEnd: boolean,
  ): VADResult {
    return {
      isSpeech: this.isSpeechActive,
      energy,
      timestamp,
      speechStart,
      speechEnd,
      // Extended metrics
      noiseFloor: this.noiseFloor,
      snr: this.snr,
    } as VADResult;
  }
}