From 0305f1dccd18eac4b1df01d80bb888c72c51521f Mon Sep 17 00:00:00 2001
From: Brian <bhetherman@gmail.com>
Date: Wed, 8 Apr 2026 03:05:26 -0400
Subject: [PATCH] updates for docker take 1

---
 .dockerignore      |  8 +++++++
 .gitignore         |  1 +
 .gitmodules        |  0
 Dockerfile         | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 docker-compose.yml | 18 ++++++++++++++++
 requirements.txt   |  7 +++++-
 server/models.py   | 28 ++++++++++++++++++------
 7 files changed, 108 insertions(+), 7 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 .gitmodules
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..7c2c8f0
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+.git/
+.gitignore
+*.md
+.env
diff --git a/.gitignore b/.gitignore
index 0e5ac79..aa47104 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .venv
+.claude
 __pycache__
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e69de29
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..049b53f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,53 @@
+# CUDA 12.8 is the first toolkit release that can target Blackwell (sm_120) GPUs
+FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+# HuggingFace model cache — mounted as a volume so models persist across runs
+ENV HF_HOME=/cache/huggingface
+
+RUN apt-get update && apt-get install -y \
+    python3.11 \
+    python3.11-dev \
+    python3.11-venv \
+    git \
+    ffmpeg \
+    curl \
+    cmake \
+    ninja-build \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bootstrap pip for python3.11 (Debian/Ubuntu system Pythons ship without pip)
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+
+# Make plain "python" resolve to the 3.11 interpreter
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+WORKDIR /app
+
+# PyTorch with Blackwell (sm_120) support. sm_120 needs a CUDA 12.8+ toolchain,
+# and the official cu128 wheels (PyTorch 2.7+) already ship sm_120 kernels, so
+# install them instead of compiling PyTorch from an unpinned source checkout.
+# torch, torchvision and torchaudio are resolved in a single pip run so the
+# three versions stay mutually compatible and torch is not replaced afterwards.
+RUN python3.11 -m pip install --no-cache-dir \
+    torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install auto-gptq pre-built wheel (avoids compiling from source).
+# NOTE(review): this index only publishes cu121 builds — confirm its CUDA
+# kernels load against the cu128 torch above, or drop it if GPTQ is unused.
+RUN python3.11 -m pip install --no-cache-dir \
+    "auto-gptq>=0.7.1" \
+    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
+
+# Install the rest of the app requirements
+COPY requirements.txt .
+RUN python3.11 -m pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 8000
+
+CMD ["python3.11", "run.py"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..14b9710
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,18 @@
+services:
+  voice-chat:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      # Keep downloaded models in a named volume so they survive container rebuilds
+      - huggingface-cache:/cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+volumes:
+  huggingface-cache:
diff --git a/requirements.txt b/requirements.txt
index bc1e861..8c90877 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,11 @@
-torch>=2.5.0
+# torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs.
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
 transformers==4.57.6
+optimum>=1.19
+compressed-tensors>=0.5.0
 silero-vad>=5.1
+qwen-asr==0.0.6
+kokoro==0.9.4
 fastapi>=0.115.0
 uvicorn[standard]>=0.30.0
 numpy
diff --git a/server/models.py b/server/models.py
index e02d521..35e059a 100644
--- a/server/models.py
+++ b/server/models.py
@@ -9,6 +9,20 @@ from server.tts import TTSEngine
 log = logging.getLogger(__name__)
 
 
+def get_device():
+    """Return "cuda:0" if a probe allocation on the GPU succeeds, otherwise "cpu"."""
+    cuda_ok = torch.cuda.is_available()
+    if cuda_ok:
+        try:
+            torch.zeros(1, device="cuda:0")  # probe: is the GPU actually usable?
+        except RuntimeError as e:
+            log.warning(f"CUDA available but error occurred: {e}. Falling back to CPU.")
+            cuda_ok = False
+    device = "cuda:0" if cuda_ok else "cpu"
+    log.info("Using CUDA device" if cuda_ok else "Using CPU device")
+    return device
+
+
 class ModelManager:
     """Loads and holds all models. Initialized once at server startup."""
 
@@ -37,28 +51,30 @@ class ModelManager:
         log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
         from qwen_asr import Qwen3ASRModel
 
+        device = get_device()
         asr_model = Qwen3ASRModel.from_pretrained(
             "Qwen/Qwen3-ASR-0.6B",
             dtype=torch.bfloat16,
-            device_map="cuda:0",
+            device_map=device,
             max_new_tokens=4096,
         )
         self.asr_engine = ASREngine(asr_model)
         log.info("Qwen3-ASR-0.6B loaded.")
 
     def _load_llm(self):
-        log.info("Loading Qwen3-0.6B-Instruct...")
+        log.info("Loading Qwen3.5-0.8B...")
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
-        model_name = "Qwen/Qwen3-0.6B"
+        model_name = "Qwen/Qwen3.5-0.8B"
+        # NOTE(review): torch_dtype is no longer passed below, so the library/checkpoint default dtype is used — confirm intended.
         tokenizer = AutoTokenizer.from_pretrained(model_name)
+        device = get_device()
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.bfloat16,
-            device_map="cuda:0",
+            device_map=device,
         )
         self.llm_engine = LLMEngine(model, tokenizer)
-        log.info("Qwen3-0.6B-Instruct loaded.")
+        log.info("Qwen3.5-0.8B loaded.")
 
     def _load_tts(self):
         log.info("Loading Kokoro TTS...")