Switch to CUDA 12.8 + ONNX-based VAD for RTX 5090 Blackwell support

Upgrade PyTorch to 2.7+ with cu128 wheels for Blackwell (sm_120) GPU support. Replace silero-vad (which depends on torchaudio) with a direct ONNX Runtime implementation of the same Silero VAD model, eliminating the torchaudio dependency entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 04:01:27 -04:00
parent 0305f1dccd
commit 263f39e0a3
4 changed files with 77 additions and 45 deletions
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
@@ -12,9 +12,6 @@ RUN apt-get update && apt-get install -y \
    git \
    ffmpeg \
    curl \
    cmake \
    ninja-build \
    build-essential \
    && rm -rf /var/lib/apt/lists/*
 # Bootstrap pip for python3.11 (Debian disables ensurepip for system Python)
@@ -24,23 +21,15 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python
 WORKDIR /app
-# Build PyTorch from source with Blackwell (sm_120) support
+# Install PyTorch 2.7+ with CUDA 12.8 support (includes Blackwell/sm_120 support)
 RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \
    cd /tmp/pytorch && \
    git submodule update --init --recursive && \
    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
    python3.11 setup.py install && \
    cd / && rm -rf /tmp/pytorch
 # Install torchvision and torchaudio with CUDA 12.1 support
 RUN python3.11 -m pip install --no-cache-dir \
-    torchvision torchaudio \
+    torch torchvision \
-    --index-url https://download.pytorch.org/whl/cu121
+    --index-url https://download.pytorch.org/whl/cu128
-# Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source)
+# Install auto-gptq pre-built wheel for CUDA 12.8 (avoids compiling from source)
 RUN python3.11 -m pip install --no-cache-dir \
    "auto-gptq>=0.7.1" \
-    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
+    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/
 # Install the rest of the app requirements
 COPY requirements.txt .
@@ -1,9 +1,10 @@
 # torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs.
-# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu128
 transformers==4.57.6
 optimum>=1.19
 compressed-tensors>=0.5.0
-silero-vad>=5.1
+onnxruntime>=1.17.0
 huggingface-hub>=0.20.0
 qwen-asr==0.0.6
 kokoro==0.9.4
 fastapi>=0.115.0
@@ -41,11 +41,15 @@ class ModelManager:
        log.info("All models loaded successfully.")
    def _load_vad(self):
-        log.info("Loading Silero VAD...")
+        log.info("Loading Silero VAD (ONNX)...")
-        from silero_vad import load_silero_vad
+        from huggingface_hub import hf_hub_download
        from server.vad import SileroVADOnnx
-        self.vad_model = load_silero_vad()
+        model_path = hf_hub_download(
-        log.info("Silero VAD loaded (CPU).")
+            repo_id="onnx-community/silero-vad", filename="silero_vad.onnx"
        )
        self.vad_model = SileroVADOnnx(model_path)
        log.info("Silero VAD loaded (ONNX, CPU).")
    def _load_asr(self):
        log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
@@ -1,21 +1,57 @@
 import numpy as np
-import torch
+import onnxruntime
 class SileroVADOnnx:
    """Silero VAD scorer backed by an ONNX Runtime session (no torchaudio).

    The ``h``/``c`` state arrays returned by each run are stored on the
    instance and fed into the next call; ``reset_states()`` zeroes them.
    """
    SAMPLE_RATE = 16000
    WINDOW_SIZE = 512  # 32ms at 16kHz
    def __init__(self, model_path: str):
        """Create the inference session for ``model_path`` and zero the state."""
        # Both inter-op and intra-op execution are limited to one thread.
        session_options = onnxruntime.SessionOptions()
        session_options.inter_op_num_threads = 1
        session_options.intra_op_num_threads = 1
        self.session = onnxruntime.InferenceSession(
            model_path, sess_options=session_options
        )
        self._reset_state()
    def _reset_state(self):
        """Replace both state arrays with fresh float32 zeros."""
        state_shape = (2, 1, 64)
        self._h = np.zeros(state_shape, dtype=np.float32)
        self._c = np.zeros(state_shape, dtype=np.float32)
    def reset_states(self):
        """Public entry point for clearing the carried-over state."""
        self._reset_state()
    def __call__(self, chunk: np.ndarray) -> float:
        """Score one audio chunk and return the model's first output as a float.

        A leading batch axis is added to ``chunk`` before it is fed to the
        session; the second and third outputs become the new ``h``/``c`` state.
        """
        feeds = {
            "input": chunk[np.newaxis, :],
            "sr": np.array(self.SAMPLE_RATE, dtype=np.int64),
            "h": self._h,
            "c": self._c,
        }
        outputs = self.session.run(None, feeds)
        score, self._h, self._c = outputs
        return float(score.squeeze())
 class StreamingVAD:
-    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""
+    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""
-    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
+    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
-        from silero_vad import VADIterator
+                 min_silence_ms: int = 400):
-
+        self.model = model
-        self.iterator = VADIterator(
+        self.threshold = threshold
-            model,
+        self.min_silence_samples = int(
-            sampling_rate=16000,
+            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
            threshold=threshold,
            min_silence_duration_ms=min_silence_ms,
        )
        self.audio_buffer: list[np.ndarray] = []
        self.is_speaking = False
        self._silence_samples = 0
    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
        """Feed a 512-sample chunk at 16kHz.
@@ -23,25 +59,26 @@ class StreamingVAD:
        Returns the complete utterance as a numpy array when speech ends,
        or None if still accumulating.
        """
-        tensor = torch.from_numpy(chunk_16k).float()
+        prob = self.model(chunk_16k.astype(np.float32))
        speech_dict = self.iterator(tensor, return_seconds=False)
-        if speech_dict:
+        if prob >= self.threshold:
-            if "start" in speech_dict:
+            self._silence_samples = 0
            if not self.is_speaking:
                self.is_speaking = True
                self.audio_buffer = []
-            if "end" in speech_dict:
+            self.audio_buffer.append(chunk_16k.copy())
        elif self.is_speaking:
            self._silence_samples += len(chunk_16k)
            self.audio_buffer.append(chunk_16k.copy())
            if self._silence_samples >= self.min_silence_samples:
                self.is_speaking = False
                self._silence_samples = 0
                if self.audio_buffer:
                    result = np.concatenate(self.audio_buffer)
                    self.audio_buffer = []
-                    self.iterator.reset_states()
+                    self.model.reset_states()
                    return result
-                self.iterator.reset_states()
+                self.model.reset_states()
                return None
        if self.is_speaking:
            self.audio_buffer.append(chunk_16k.copy())
        return None
@@ -49,4 +86,5 @@ class StreamingVAD:
        """Reset VAD state for a new conversation turn."""
        self.audio_buffer = []
        self.is_speaking = False
-        self.iterator.reset_states()
+        self._silence_samples = 0
        self.model.reset_states()