import numpy as np
import onnxruntime


class SileroVADOnnx:
    """Silero VAD model loaded via ONNX Runtime (no torchaudio dependency).

    The wrapper is stateful: every call feeds the state tensor returned by
    the previous call back into the model. Call ``reset_states()`` before
    starting an unrelated audio stream.
    """

    SAMPLE_RATE = 16000
    WINDOW_SIZE = 512  # 32ms at 16kHz

    def __init__(self, model_path: str):
        """Create a single-threaded inference session for *model_path*."""
        opts = onnxruntime.SessionOptions()
        # One inter-op and one intra-op thread for this session.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.session = onnxruntime.InferenceSession(
            model_path, sess_options=opts
        )
        self._reset_state()

    def _reset_state(self) -> None:
        """Replace the recurrent state with a zero-filled (2, 1, 128) tensor."""
        self._state = np.zeros((2, 1, 128), dtype=np.float32)

    def reset_states(self) -> None:
        """Public alias for clearing the recurrent state."""
        self._reset_state()

    def __call__(self, chunk: np.ndarray) -> float:
        """Run VAD on a single audio chunk. Returns speech probability.

        Args:
            chunk: 1-D array of audio samples. It is converted to float32
                (without copying when it already is float32) before being
                handed to the session.
                NOTE(review): presumably WINDOW_SIZE samples at SAMPLE_RATE,
                scaled the way the model expects -- confirm with callers.

        Returns:
            The model's scalar output as a Python float.
        """
        # NOTE(review): some Silero ONNX exports expect a short context of
        # previous samples prepended to each window; confirm this export
        # does not require it.
        samples = np.asarray(chunk, dtype=np.float32)
        ort_inputs = {
            "input": samples[np.newaxis, :],  # add batch dim
            "sr": np.array(self.SAMPLE_RATE, dtype=np.int64),
            "state": self._state,
        }
        # The second output becomes the state input of the next call.
        out, self._state = self.session.run(None, ort_inputs)
        return float(out.squeeze())


class StreamingVAD:
    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""

    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
                 min_silence_ms: int = 400):
        """Set up the detector.

        Args:
            model: Callable returning a speech probability for a chunk and
                exposing ``reset_states()``.
            threshold: Probability at or above which a chunk counts as speech.
            min_silence_ms: Amount of consecutive non-speech audio, in
                milliseconds, that ends an utterance.
        """
        self.model = model
        self.threshold = threshold
        # Convert the silence duration into a sample count at the model rate.
        self.min_silence_samples = int(
            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
        )
        self.audio_buffer: list[np.ndarray] = []
        self.is_speaking = False
        self._silence_samples = 0

    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
        """Feed a 512-sample chunk at 16kHz.

        Returns the complete utterance as a numpy array when speech ends,
        or None if still accumulating. The returned audio keeps the dtype of
        the chunks that were fed in and includes the trailing non-speech
        chunks that triggered the end of the utterance.
        """
        # asarray (unlike astype) does not copy when the chunk is float32.
        prob = self.model(np.asarray(chunk_16k, dtype=np.float32))

        if prob >= self.threshold:
            self._silence_samples = 0
            if not self.is_speaking:
                # Speech onset: start a fresh utterance.
                self.is_speaking = True
                self.audio_buffer = []
            self.audio_buffer.append(chunk_16k.copy())
            return None

        if not self.is_speaking:
            # Non-speech outside an utterance is discarded.
            return None

        # Non-speech inside an utterance: keep it and count it as silence.
        self._silence_samples += len(chunk_16k)
        self.audio_buffer.append(chunk_16k.copy())
        if self._silence_samples < self.min_silence_samples:
            return None

        # Enough silence: the buffer is non-empty here because the current
        # chunk was just appended, so the utterance can always be emitted.
        utterance = np.concatenate(self.audio_buffer)
        self.reset()
        return utterance

    def reset(self) -> None:
        """Reset VAD state for a new conversation turn."""
        self.audio_buffer = []
        self.is_speaking = False
        self._silence_samples = 0
        self.model.reset_states()