# live-voice-chat/server/vad.py

import numpy as np
import torch


class StreamingVAD:
    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""

    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
        from silero_vad import VADIterator

        self.iterator = VADIterator(
            model,
            sampling_rate=16000,
            threshold=threshold,
            min_silence_duration_ms=min_silence_ms,
        )
        self.audio_buffer: list[np.ndarray] = []
        self.is_speaking = False

    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
        """Feed a 512-sample chunk at 16kHz.

        Returns the complete utterance as a numpy array when speech ends,
        or None if still accumulating.
        """
        tensor = torch.from_numpy(chunk_16k).float()
        speech_dict = self.iterator(tensor, return_seconds=False)

        if speech_dict:
            if "start" in speech_dict:
                self.is_speaking = True
                self.audio_buffer = []
            if "end" in speech_dict:
                self.is_speaking = False
                if self.audio_buffer:
                    result = np.concatenate(self.audio_buffer)
                    self.audio_buffer = []
                    self.iterator.reset_states()
                    return result
                self.iterator.reset_states()
                return None

        if self.is_speaking:
            self.audio_buffer.append(chunk_16k.copy())

        return None

    def reset(self):
        """Reset VAD state for a new conversation turn."""
        self.audio_buffer = []
        self.is_speaking = False
        self.iterator.reset_states()