Switch to CUDA 12.8 + ONNX-based VAD for RTX 5090 Blackwell support

Upgrade PyTorch to 2.7+ with cu128 wheels for Blackwell (sm_120) GPU support. Replace silero-vad (which depends on torchaudio) with a direct ONNX Runtime implementation of the same Silero VAD model, eliminating the torchaudio dependency entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 04:01:27 -04:00
parent 0305f1dccd
commit 263f39e0a3
4 changed files with 77 additions and 45 deletions
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04

 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
@@ -12,9 +12,6 @@ RUN apt-get update && apt-get install -y \
    git \
    ffmpeg \
    curl \
-    cmake \
-    ninja-build \
-    build-essential \
    && rm -rf /var/lib/apt/lists/*

 # Bootstrap pip for python3.11 (Debian disables ensurepip for system Python)
@@ -24,23 +21,15 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python

 WORKDIR /app

-# Build PyTorch from source with Blackwell (sm_120) support
-RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \
-    cd /tmp/pytorch && \
-    git submodule update --init --recursive && \
-    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
-    python3.11 setup.py install && \
-    cd / && rm -rf /tmp/pytorch
-
-# Install torchvision and torchaudio with CUDA 12.1 support
+# Install PyTorch 2.7+ with CUDA 12.8 support (includes Blackwell/sm_120 support)
 RUN python3.11 -m pip install --no-cache-dir \
-    torchvision torchaudio \
-    --index-url https://download.pytorch.org/whl/cu121
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128

-# Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source)
+# Install auto-gptq pre-built wheel for CUDA 12.8 (avoids compiling from source)
 RUN python3.11 -m pip install --no-cache-dir \
    "auto-gptq>=0.7.1" \
-    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
+    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/

 # Install the rest of the app requirements
 COPY requirements.txt .
@@ -1,9 +1,10 @@
 # torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs.
-# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu128
 transformers==4.57.6
 optimum>=1.19
 compressed-tensors>=0.5.0
-silero-vad>=5.1
+onnxruntime>=1.17.0
+huggingface-hub>=0.20.0
 qwen-asr==0.0.6
 kokoro==0.9.4
 fastapi>=0.115.0
@@ -41,11 +41,15 @@ class ModelManager:
        log.info("All models loaded successfully.")

    def _load_vad(self):
-        log.info("Loading Silero VAD...")
-        from silero_vad import load_silero_vad
+        log.info("Loading Silero VAD (ONNX)...")
+        from huggingface_hub import hf_hub_download
+        from server.vad import SileroVADOnnx

-        self.vad_model = load_silero_vad()
-        log.info("Silero VAD loaded (CPU).")
+        model_path = hf_hub_download(
+            repo_id="onnx-community/silero-vad", filename="silero_vad.onnx"
+        )
+        self.vad_model = SileroVADOnnx(model_path)
+        log.info("Silero VAD loaded (ONNX, CPU).")

    def _load_asr(self):
        log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
@@ -1,21 +1,57 @@
 import numpy as np
-import torch
+import onnxruntime
+
+
+class SileroVADOnnx:
+    """Silero VAD model loaded via ONNX Runtime (no torchaudio dependency)."""
+
+    SAMPLE_RATE = 16000
+    WINDOW_SIZE = 512  # 32ms at 16kHz
+
+    def __init__(self, model_path: str):
+        opts = onnxruntime.SessionOptions()
+        opts.inter_op_num_threads = 1
+        opts.intra_op_num_threads = 1
+        self.session = onnxruntime.InferenceSession(
+            model_path, sess_options=opts
+        )
+        self._reset_state()
+
+    def _reset_state(self):
+        self._h = np.zeros((2, 1, 64), dtype=np.float32)
+        self._c = np.zeros((2, 1, 64), dtype=np.float32)
+
+    def reset_states(self):
+        self._reset_state()
+
+    def __call__(self, chunk: np.ndarray) -> float:
+        """Run VAD on a single audio chunk. Returns speech probability."""
+        input_data = chunk[np.newaxis, :]  # add batch dim
+        sr = np.array(self.SAMPLE_RATE, dtype=np.int64)
+
+        ort_inputs = {
+            "input": input_data,
+            "sr": sr,
+            "h": self._h,
+            "c": self._c,
+        }
+        out, self._h, self._c = self.session.run(None, ort_inputs)
+        return float(out.squeeze())


 class StreamingVAD:
-    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""
+    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""

-    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
-        from silero_vad import VADIterator
-
-        self.iterator = VADIterator(
-            model,
-            sampling_rate=16000,
-            threshold=threshold,
-            min_silence_duration_ms=min_silence_ms,
+    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
+                 min_silence_ms: int = 400):
+        self.model = model
+        self.threshold = threshold
+        self.min_silence_samples = int(
+            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
        )
        self.audio_buffer: list[np.ndarray] = []
        self.is_speaking = False
+        self._silence_samples = 0

    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
        """Feed a 512-sample chunk at 16kHz.
@@ -23,25 +59,26 @@ class StreamingVAD:
        Returns the complete utterance as a numpy array when speech ends,
        or None if still accumulating.
        """
-        tensor = torch.from_numpy(chunk_16k).float()
-        speech_dict = self.iterator(tensor, return_seconds=False)
+        prob = self.model(chunk_16k.astype(np.float32))

-        if speech_dict:
-            if "start" in speech_dict:
+        if prob >= self.threshold:
+            self._silence_samples = 0
+            if not self.is_speaking:
                self.is_speaking = True
                self.audio_buffer = []
-            if "end" in speech_dict:
+            self.audio_buffer.append(chunk_16k.copy())
+        elif self.is_speaking:
+            self._silence_samples += len(chunk_16k)
+            self.audio_buffer.append(chunk_16k.copy())
+            if self._silence_samples >= self.min_silence_samples:
                self.is_speaking = False
+                self._silence_samples = 0
                if self.audio_buffer:
                    result = np.concatenate(self.audio_buffer)
                    self.audio_buffer = []
-                    self.iterator.reset_states()
+                    self.model.reset_states()
                    return result
-                self.iterator.reset_states()
-                return None
-
-        if self.is_speaking:
-            self.audio_buffer.append(chunk_16k.copy())
+                self.model.reset_states()

        return None

@@ -49,4 +86,5 @@ class StreamingVAD:
        """Reset VAD state for a new conversation turn."""
        self.audio_buffer = []
        self.is_speaking = False
-        self.iterator.reset_states()
+        self._silence_samples = 0
+        self.model.reset_states()