import numpy as np import torch class StreamingVAD: """Wraps Silero VAD for streaming chunk-by-chunk speech detection.""" def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400): from silero_vad import VADIterator self.iterator = VADIterator( model, sampling_rate=16000, threshold=threshold, min_silence_duration_ms=min_silence_ms, ) self.audio_buffer: list[np.ndarray] = [] self.is_speaking = False def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None: """Feed a 512-sample chunk at 16kHz. Returns the complete utterance as a numpy array when speech ends, or None if still accumulating. """ tensor = torch.from_numpy(chunk_16k).float() speech_dict = self.iterator(tensor, return_seconds=False) if speech_dict: if "start" in speech_dict: self.is_speaking = True self.audio_buffer = [] if "end" in speech_dict: self.is_speaking = False if self.audio_buffer: result = np.concatenate(self.audio_buffer) self.audio_buffer = [] self.iterator.reset_states() return result self.iterator.reset_states() return None if self.is_speaking: self.audio_buffer.append(chunk_16k.copy()) return None def reset(self): """Reset VAD state for a new conversation turn.""" self.audio_buffer = [] self.is_speaking = False self.iterator.reset_states()