Move the image to CUDA 12.8 wheels and run Silero VAD through ONNX Runtime.

What this patch does, file by file:

* Dockerfile: switches the base image from nvidia/cuda 12.1.1 to 12.8.0,
  drops cmake / ninja-build / build-essential, replaces the from-source
  PyTorch build with `pip install torch torchvision` from the cu128 wheel
  index, and points the auto-gptq extra index at cu128.
* requirements.txt: replaces silero-vad with onnxruntime and
  huggingface-hub, and updates the local-dev hint to the cu128 index.
* server/models.py: `_load_vad` now downloads `silero_vad.onnx` from the
  `onnx-community/silero-vad` hub repo and wraps it in `SileroVADOnnx`.
* server/vad.py: adds `SileroVADOnnx`, a thin ONNX Runtime wrapper that
  carries the recurrent `h` / `c` state between calls. `StreamingVAD` no
  longer uses `VADIterator`; it compares each chunk's probability against
  `threshold` itself and ends an utterance once `min_silence_ms` worth of
  below-threshold samples has accumulated. Those trailing silence chunks
  are part of the returned utterance.

Review notes:

* NOTE(review): this file had lost its line structure. The layout below was
  rebuilt so that every hunk matches its `@@` line counts, but indentation
  and the position of blank context lines are inferred. Verify against the
  target tree, or apply with `git apply --ignore-whitespace`.
* NOTE(review): `InferenceSession` now pins
  `providers=["CPUExecutionProvider"]` so the session stays on the CPU (as
  the log message says) even if a GPU-enabled onnxruntime build is
  installed. This differs from the commit recorded in the `index` line of
  server/vad.py.
* NOTE(review): `SileroVADOnnx.__call__` feeds inputs named "input", "sr",
  "h", "c" and unpacks exactly three outputs. Confirm with
  `session.get_inputs()` / `get_outputs()` that the downloaded file exposes
  that interface, and that `silero_vad.onnx` is the filename the hub repo
  actually hosts - TODO confirm.
* NOTE(review): confirm that a cu128 auto-gptq wheel index exists; with
  `--extra-index-url` pip will otherwise fall back to the default index.
* NOTE(review): torchaudio is no longer installed; confirm no remaining
  dependency imports it.

diff --git a/Dockerfile b/Dockerfile
index 049b53f..d41be01 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
@@ -12,9 +12,6 @@ RUN apt-get update && apt-get install -y \
     git \
     ffmpeg \
     curl \
-    cmake \
-    ninja-build \
-    build-essential \
     && rm -rf /var/lib/apt/lists/*
 
 # Bootstrap pip for python3.11 (Debian disables ensurepip for system Python)
@@ -24,23 +21,15 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python
 
 WORKDIR /app
 
-# Build PyTorch from source with Blackwell (sm_120) support
-RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \
-    cd /tmp/pytorch && \
-    git submodule update --init --recursive && \
-    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
-    python3.11 setup.py install && \
-    cd / && rm -rf /tmp/pytorch
-
-# Install torchvision and torchaudio with CUDA 12.1 support
+# Install PyTorch 2.7+ with CUDA 12.8 support (includes Blackwell/sm_120 support)
 RUN python3.11 -m pip install --no-cache-dir \
-    torchvision torchaudio \
-    --index-url https://download.pytorch.org/whl/cu121
+    torch torchvision \
+    --index-url https://download.pytorch.org/whl/cu128
 
-# Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source)
+# Install auto-gptq pre-built wheel for CUDA 12.8 (avoids compiling from source)
 RUN python3.11 -m pip install --no-cache-dir \
     "auto-gptq>=0.7.1" \
-    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
+    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/
 
 # Install the rest of the app requirements
 COPY requirements.txt .
diff --git a/requirements.txt b/requirements.txt
index 8c90877..37c8d79 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
 # torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs.
-# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu128
 transformers==4.57.6
 optimum>=1.19
 compressed-tensors>=0.5.0
-silero-vad>=5.1
+onnxruntime>=1.17.0
+huggingface-hub>=0.20.0
 qwen-asr==0.0.6
 kokoro==0.9.4
 fastapi>=0.115.0
diff --git a/server/models.py b/server/models.py
index 35e059a..38aff8d 100644
--- a/server/models.py
+++ b/server/models.py
@@ -41,11 +41,15 @@ class ModelManager:
         log.info("All models loaded successfully.")
 
     def _load_vad(self):
-        log.info("Loading Silero VAD...")
-        from silero_vad import load_silero_vad
+        log.info("Loading Silero VAD (ONNX)...")
+        from huggingface_hub import hf_hub_download
+        from server.vad import SileroVADOnnx
 
-        self.vad_model = load_silero_vad()
-        log.info("Silero VAD loaded (CPU).")
+        model_path = hf_hub_download(
+            repo_id="onnx-community/silero-vad", filename="silero_vad.onnx"
+        )
+        self.vad_model = SileroVADOnnx(model_path)
+        log.info("Silero VAD loaded (ONNX, CPU).")
 
     def _load_asr(self):
         log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
diff --git a/server/vad.py b/server/vad.py
index 887f94c..15f28f0 100644
--- a/server/vad.py
+++ b/server/vad.py
@@ -1,21 +1,57 @@
 import numpy as np
-import torch
+import onnxruntime
+
+
+class SileroVADOnnx:
+    """Silero VAD model loaded via ONNX Runtime (no torchaudio dependency)."""
+
+    SAMPLE_RATE = 16000
+    WINDOW_SIZE = 512  # 32ms at 16kHz
+
+    def __init__(self, model_path: str):
+        opts = onnxruntime.SessionOptions()
+        opts.inter_op_num_threads = 1
+        opts.intra_op_num_threads = 1
+        self.session = onnxruntime.InferenceSession(
+            model_path, sess_options=opts, providers=["CPUExecutionProvider"]
+        )
+        self._reset_state()
+
+    def _reset_state(self):
+        self._h = np.zeros((2, 1, 64), dtype=np.float32)
+        self._c = np.zeros((2, 1, 64), dtype=np.float32)
+
+    def reset_states(self):
+        self._reset_state()
+
+    def __call__(self, chunk: np.ndarray) -> float:
+        """Run VAD on a single audio chunk. Returns speech probability."""
+        input_data = chunk[np.newaxis, :]  # add batch dim
+        sr = np.array(self.SAMPLE_RATE, dtype=np.int64)
+
+        ort_inputs = {
+            "input": input_data,
+            "sr": sr,
+            "h": self._h,
+            "c": self._c,
+        }
+        out, self._h, self._c = self.session.run(None, ort_inputs)
+        return float(out.squeeze())
 
 
 class StreamingVAD:
-    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""
+    """Wraps Silero VAD (ONNX) for streaming chunk-by-chunk speech detection."""
 
-    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
-        from silero_vad import VADIterator
-
-        self.iterator = VADIterator(
-            model,
-            sampling_rate=16000,
-            threshold=threshold,
-            min_silence_duration_ms=min_silence_ms,
+    def __init__(self, model: SileroVADOnnx, threshold: float = 0.5,
+                 min_silence_ms: int = 400):
+        self.model = model
+        self.threshold = threshold
+        self.min_silence_samples = int(
+            SileroVADOnnx.SAMPLE_RATE * min_silence_ms / 1000
         )
         self.audio_buffer: list[np.ndarray] = []
         self.is_speaking = False
+        self._silence_samples = 0
 
     def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
         """Feed a 512-sample chunk at 16kHz.
@@ -23,25 +59,26 @@ class StreamingVAD:
         Returns the complete utterance as a numpy array when speech ends,
         or None if still accumulating.
         """
-        tensor = torch.from_numpy(chunk_16k).float()
-        speech_dict = self.iterator(tensor, return_seconds=False)
+        prob = self.model(chunk_16k.astype(np.float32))
 
-        if speech_dict:
-            if "start" in speech_dict:
+        if prob >= self.threshold:
+            self._silence_samples = 0
+            if not self.is_speaking:
                 self.is_speaking = True
                 self.audio_buffer = []
-            if "end" in speech_dict:
+            self.audio_buffer.append(chunk_16k.copy())
+        elif self.is_speaking:
+            self._silence_samples += len(chunk_16k)
+            self.audio_buffer.append(chunk_16k.copy())
+            if self._silence_samples >= self.min_silence_samples:
                 self.is_speaking = False
+                self._silence_samples = 0
                 if self.audio_buffer:
                     result = np.concatenate(self.audio_buffer)
                     self.audio_buffer = []
-                    self.iterator.reset_states()
+                    self.model.reset_states()
                     return result
-                self.iterator.reset_states()
-                return None
-
-        if self.is_speaking:
-            self.audio_buffer.append(chunk_16k.copy())
+                self.model.reset_states()
 
         return None
 
@@ -49,4 +86,5 @@ class StreamingVAD:
         """Reset VAD state for a new conversation turn."""
         self.audio_buffer = []
         self.is_speaking = False
-        self.iterator.reset_states()
+        self._silence_samples = 0
+        self.model.reset_states()