Switch to CUDA 12.8 + ONNX-based VAD for RTX 5090 Blackwell support

Upgrade PyTorch to 2.7+ with cu128 wheels for Blackwell (sm_120) GPU support. Replace silero-vad (which depends on torchaudio) with a direct ONNX Runtime implementation of the same Silero VAD model, eliminating the torchaudio dependency entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 04:01:27 -04:00
parent 0305f1dccd
commit 263f39e0a3
4 changed files with 77 additions and 45 deletions
@@ -1,21 +1,57 @@
 import numpy as np
-import torch
+import onnxruntime
+
+
+class SileroVADOnnx:
+    """Silero VAD model loaded via ONNX Runtime (no torchaudio dependency)."""
+
+    SAMPLE_RATE = 16000  # Hz; sent to the model as the "sr" input on every call
+    WINDOW_SIZE = 512  # 32 ms at 16 kHz; chunk length is not checked by __call__
+
+    def __init__(self, model_path: str):
+        opts = onnxruntime.SessionOptions()
+        opts.inter_op_num_threads = 1  # one thread across operators
+        opts.intra_op_num_threads = 1  # one thread within an operator
+        self.session = onnxruntime.InferenceSession(
+            model_path, sess_options=opts
+        )
+        self._reset_state()  # begin with all-zero h/c state
+
+    def _reset_state(self):
+        self._h = np.zeros((2, 1, 64), dtype=np.float32)  # fed back as input "h"
+        self._c = np.zeros((2, 1, 64), dtype=np.float32)  # fed back as input "c"
+
+    def reset_states(self):  # public entry point; delegates to _reset_state
+        self._reset_state()
+
+    def __call__(self, chunk: np.ndarray) -> float:
+        """Run VAD on a single audio chunk. Returns speech probability."""
+        input_data = chunk[np.newaxis, :]  # prepend a length-1 batch axis
+        sr = np.array(self.SAMPLE_RATE, dtype=np.int64)  # 0-d int64 array
+
+        ort_inputs = {
+            "input": input_data,
+            "sr": sr,
+            "h": self._h,
+            "c": self._c,
+        }
+        out, self._h, self._c = self.session.run(None, ort_inputs)  # assumes exactly 3 outputs (score, h, c) - TODO confirm against the model file
+        return float(out.squeeze())  # assumes `out` holds a single element


 class StreamingVAD:
-    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""
+    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""

-    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
-        from silero_vad import VADIterator
-
-        self.iterator = VADIterator(
-            model,
-            sampling_rate=16000,
-            threshold=threshold,
-            min_silence_duration_ms=min_silence_ms,
+    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
+                 min_silence_ms: int = 400):
+        self.model = model
+        self.threshold = threshold
+        self.min_silence_samples = int(
+            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
        )
        self.audio_buffer: list[np.ndarray] = []
        self.is_speaking = False
+        self._silence_samples = 0

    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
        """Feed a 512-sample chunk at 16kHz.
@@ -23,25 +59,26 @@ class StreamingVAD:
        Returns the complete utterance as a numpy array when speech ends,
        or None if still accumulating.
        """
-        tensor = torch.from_numpy(chunk_16k).float()
-        speech_dict = self.iterator(tensor, return_seconds=False)
+        prob = self.model(chunk_16k.astype(np.float32))

-        if speech_dict:
-            if "start" in speech_dict:
+        if prob >= self.threshold:
+            self._silence_samples = 0
+            if not self.is_speaking:
                self.is_speaking = True
                self.audio_buffer = []
-            if "end" in speech_dict:
+            self.audio_buffer.append(chunk_16k.copy())
+        elif self.is_speaking:
+            self._silence_samples += len(chunk_16k)
+            self.audio_buffer.append(chunk_16k.copy())
+            if self._silence_samples >= self.min_silence_samples:
                self.is_speaking = False
+                self._silence_samples = 0
                if self.audio_buffer:
                    result = np.concatenate(self.audio_buffer)
                    self.audio_buffer = []
-                    self.iterator.reset_states()
+                    self.model.reset_states()
                    return result
-                self.iterator.reset_states()
-                return None
-
-        if self.is_speaking:
-            self.audio_buffer.append(chunk_16k.copy())
+                self.model.reset_states()

        return None

@@ -49,4 +86,5 @@ class StreamingVAD:
        """Reset VAD state for a new conversation turn."""
        self.audio_buffer = []
        self.is_speaking = False
-        self.iterator.reset_states()
+        self._silence_samples = 0
+        self.model.reset_states()