diff --git a/Dockerfile b/Dockerfile
index 049b53f..d41be01 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
@@ -12,9 +12,6 @@ RUN apt-get update && apt-get install -y \
     git \
     ffmpeg \
     curl \
-    cmake \
-    ninja-build \
-    build-essential \
     && rm -rf /var/lib/apt/lists/*
 
 # Bootstrap pip for python3.11 (Debian disables ensurepip for system Python)
@@ -24,23 +21,15 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python
 
 WORKDIR /app
 
-# Build PyTorch from source with Blackwell (sm_120) support
-RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \
-    cd /tmp/pytorch && \
-    git submodule update --init --recursive && \
-    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
-    python3.11 setup.py install && \
-    cd / && rm -rf /tmp/pytorch
-
-# Install torchvision and torchaudio with CUDA 12.1 support
+# Install PyTorch >= 2.7 from the CUDA 12.8 wheel index (Blackwell/sm_120 support); the lower bound enforces what this comment promises
 RUN python3.11 -m pip install --no-cache-dir \
-    torchvision torchaudio \
-    --index-url https://download.pytorch.org/whl/cu121
+    "torch>=2.7" torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
 
-# Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source)
+# Install auto-gptq pre-built wheel for CUDA 12.8 (avoids compiling from source). NOTE(review): confirm the cu128 index below actually serves wheels; --extra-index-url only adds to PyPI, so pip may resolve from there instead - TODO verify
 RUN python3.11 -m pip install --no-cache-dir \
     "auto-gptq>=0.7.1" \
-    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
+    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/
 
 # Install the rest of the app requirements
 COPY requirements.txt .
diff --git a/requirements.txt b/requirements.txt
index 8c90877..37c8d79 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
 # torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs.
-# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu128
 transformers==4.57.6
 optimum>=1.19
 compressed-tensors>=0.5.0
-silero-vad>=5.1
+onnxruntime>=1.17.0
+huggingface-hub>=0.20.0
 qwen-asr==0.0.6
 kokoro==0.9.4
 fastapi>=0.115.0
diff --git a/server/models.py b/server/models.py
index 35e059a..38aff8d 100644
--- a/server/models.py
+++ b/server/models.py
@@ -41,11 +41,15 @@ class ModelManager:
         log.info("All models loaded successfully.")
 
     def _load_vad(self):
-        log.info("Loading Silero VAD...")
-        from silero_vad import load_silero_vad
+        log.info("Loading Silero VAD (ONNX)...")  # fetch the ONNX file, then wrap it in SileroVADOnnx
+        from huggingface_hub import hf_hub_download
+        from server.vad import SileroVADOnnx  # imported at function scope, like hf_hub_download above
 
-        self.vad_model = load_silero_vad()
-        log.info("Silero VAD loaded (CPU).")
+        model_path = hf_hub_download(  # NOTE(review): confirm the repo ships this filename at its root - TODO verify
+            repo_id="onnx-community/silero-vad", filename="silero_vad.onnx"
+        )
+        self.vad_model = SileroVADOnnx(model_path)  # model_path is handed to onnxruntime.InferenceSession
+        log.info("Silero VAD loaded (ONNX, CPU).")
 
     def _load_asr(self):
         log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
diff --git a/server/vad.py b/server/vad.py
index 887f94c..15f28f0 100644
--- a/server/vad.py
+++ b/server/vad.py
@@ -1,21 +1,57 @@
 import numpy as np
-import torch
+import onnxruntime
+
+
+class SileroVADOnnx:
+    """Stateful Silero VAD scorer run through ONNX Runtime (no torchaudio dependency)."""
+
+    SAMPLE_RATE = 16000  # Hz; fed to the model as the "sr" input
+    WINDOW_SIZE = 512  # 32ms at 16kHz
+
+    def __init__(self, model_path: str):
+        """Open a single-threaded inference session on the file at model_path."""
+        opts = onnxruntime.SessionOptions()
+        opts.inter_op_num_threads = opts.intra_op_num_threads = 1
+        self.session = onnxruntime.InferenceSession(
+            model_path, sess_options=opts
+        )
+        self._reset_state()
+
+    def _reset_state(self):
+        # NOTE(review): assumes the graph takes "h"/"c" inputs of shape (2, 1, 64) - TODO confirm
+        self._h = np.zeros((2, 1, 64), dtype=np.float32)
+        self._c = np.zeros((2, 1, 64), dtype=np.float32)
+
+    def reset_states(self):
+        """Zero the carried-over h/c state so the next chunk starts a fresh stream."""
+        self._reset_state()
+
+    def __call__(self, chunk: np.ndarray) -> float:
+        """Score one audio chunk (any numeric dtype); returns speech probability and updates h/c."""
+        input_data = np.asarray(chunk, dtype=np.float32)[np.newaxis, :]  # coerce to float32, add batch dim
+        ort_inputs = {
+            "input": input_data,
+            "sr": np.array(self.SAMPLE_RATE, dtype=np.int64),
+            "h": self._h,
+            "c": self._c,
+        }
+        out, self._h, self._c = self.session.run(None, ort_inputs)
+        return float(out.squeeze())
 
 
 class StreamingVAD:
-    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""
+    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""
 
-    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
-        from silero_vad import VADIterator
-
-        self.iterator = VADIterator(
-            model,
-            sampling_rate=16000,
-            threshold=threshold,
-            min_silence_duration_ms=min_silence_ms,
+    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
+                 min_silence_ms: int = 400):
+        self.model = model  # NOTE(review): model keeps h/c state between calls; if one instance backs several StreamingVADs their streams would interleave - confirm
+        self.threshold = threshold  # minimum speech probability that counts as speech
+        self.min_silence_samples = int(  # min_silence_ms converted to a sample count
+            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
         )
         self.audio_buffer: list[np.ndarray] = []
         self.is_speaking = False
+        self._silence_samples = 0  # below-threshold samples seen since the last speech chunk
 
     def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
         """Feed a 512-sample chunk at 16kHz.
@@ -23,25 +59,26 @@ class StreamingVAD:
         Returns the complete utterance as a numpy array when speech ends,
         or None if still accumulating.
         """
-        tensor = torch.from_numpy(chunk_16k).float()
-        speech_dict = self.iterator(tensor, return_seconds=False)
+        prob = self.model(chunk_16k.astype(np.float32))  # speech probability for this chunk
 
-        if speech_dict:
-            if "start" in speech_dict:
+        if prob >= self.threshold:
+            self._silence_samples = 0  # any speech chunk restarts the silence countdown
+            if not self.is_speaking:  # first speech chunk of a new utterance
                 self.is_speaking = True
                 self.audio_buffer = []
-            if "end" in speech_dict:
+            self.audio_buffer.append(chunk_16k.copy())
+        elif self.is_speaking:  # below threshold while an utterance is open
+            self._silence_samples += len(chunk_16k)
+            self.audio_buffer.append(chunk_16k.copy())  # trailing silence stays in the returned utterance
+            if self._silence_samples >= self.min_silence_samples:  # quiet long enough: the utterance is over
                 self.is_speaking = False
+                self._silence_samples = 0
                 if self.audio_buffer:
                     result = np.concatenate(self.audio_buffer)
                     self.audio_buffer = []
-                    self.iterator.reset_states()
+                    self.model.reset_states()  # zero the model's h/c state before the next utterance
                     return result
-                self.iterator.reset_states()
-                return None
-
-        if self.is_speaking:
-            self.audio_buffer.append(chunk_16k.copy())
+                self.model.reset_states()  # not reachable in this branch: the append above leaves the buffer non-empty
 
         return None
 
@@ -49,4 +86,5 @@ class StreamingVAD:
         """Reset VAD state for a new conversation turn."""
         self.audio_buffer = []
         self.is_speaking = False
-        self.iterator.reset_states()
+        self._silence_samples = 0
+        self.model.reset_states()