# live-voice-chat/server/vad.py

import numpy as np
import onnxruntime


class SileroVADOnnx:
    """Silero VAD model loaded via ONNX Runtime (no torchaudio dependency).

    The wrapper is stateful: every call feeds the ``h``/``c`` state tensors
    left by the previous call back into the model and stores the updated
    ones, so chunks must be supplied in stream order. Call
    ``reset_states()`` before starting an unrelated stream.
    """

    SAMPLE_RATE = 16000
    WINDOW_SIZE = 512  # 32ms at 16kHz

    def __init__(self, model_path: str):
        """Create the inference session for the model at *model_path*."""
        opts = onnxruntime.SessionOptions()
        # Pin ONNX Runtime to one inter-op and one intra-op thread.
        # NOTE(review): presumably to avoid thread oversubscription when
        # several sessions run side by side -- confirm.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.session = onnxruntime.InferenceSession(
            model_path, sess_options=opts
        )
        self._reset_state()

    def _reset_state(self):
        """Zero the ``h`` and ``c`` state tensors, each (2, 1, 64) float32."""
        self._h = np.zeros((2, 1, 64), dtype=np.float32)
        self._c = np.zeros((2, 1, 64), dtype=np.float32)

    def reset_states(self):
        """Public entry point for clearing the carried-over model state."""
        self._reset_state()

    def __call__(self, chunk: np.ndarray) -> float:
        """Run VAD on a single audio chunk. Returns speech probability.

        Args:
            chunk: 1-D array of audio samples. Any numeric dtype or memory
                layout is accepted; it is converted to contiguous float32
                (values are cast, not rescaled) before inference.

        Returns:
            The model's single output value as a Python float.
        """
        # The session is fed float32 explicitly so that callers handing over
        # int16/float64 data or a strided view do not trip an input-type
        # error inside ONNX Runtime.
        samples = np.ascontiguousarray(chunk, dtype=np.float32)
        input_data = samples[np.newaxis, :]  # add batch dim
        sr = np.array(self.SAMPLE_RATE, dtype=np.int64)

        ort_inputs = {
            "input": input_data,
            "sr": sr,
            "h": self._h,
            "c": self._c,
        }
        # ``None`` asks for every model output; they are unpacked as
        # (probability, new h, new c) and the state is kept for the next call.
        out, self._h, self._c = self.session.run(None, ort_inputs)
        return float(out.squeeze())


class StreamingVAD:
    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""

    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
                 min_silence_ms: int = 400):
        self.model = model
        self.threshold = threshold
        self.min_silence_samples = int(
            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
        )
        self.audio_buffer: list[np.ndarray] = []
        self.is_speaking = False
        self._silence_samples = 0

    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
        """Feed a 512-sample chunk at 16kHz.

        Returns the complete utterance as a numpy array when speech ends,
        or None if still accumulating.
        """
        prob = self.model(chunk_16k.astype(np.float32))

        if prob >= self.threshold:
            self._silence_samples = 0
            if not self.is_speaking:
                self.is_speaking = True
                self.audio_buffer = []
            self.audio_buffer.append(chunk_16k.copy())
        elif self.is_speaking:
            self._silence_samples += len(chunk_16k)
            self.audio_buffer.append(chunk_16k.copy())
            if self._silence_samples >= self.min_silence_samples:
                self.is_speaking = False
                self._silence_samples = 0
                if self.audio_buffer:
                    result = np.concatenate(self.audio_buffer)
                    self.audio_buffer = []
                    self.model.reset_states()
                    return result
                self.model.reset_states()

        return None

    def reset(self):
        """Reset VAD state for a new conversation turn."""
        self.audio_buffer = []
        self.is_speaking = False
        self._silence_samples = 0
        self.model.reset_states()