updates for docker take 1

This commit is contained in:
2026-04-08 03:05:26 -04:00
parent ce41bca422
commit 0305f1dccd
7 changed files with 108 additions and 7 deletions
+8
View File
@@ -0,0 +1,8 @@
# Docker matches these patterns against paths relative to the build-context
# root; unlike .gitignore, a bare name or glob does NOT apply at every depth.
# A leading **/ is required to exclude entries inside subdirectories.

# Virtual environment and Python bytecode (at any depth)
.venv/
**/__pycache__
**/*.pyc
**/*.pyo

# Version-control metadata
.git/
.gitignore

# Top-level docs and local secrets
*.md
.env
+1
View File
@@ -1,2 +1,3 @@
.venv .venv
.claude
__pycache__ __pycache__
View File
+53
View File
@@ -0,0 +1,53 @@
# CUDA 12.8 is the first toolkit whose nvcc can target Blackwell (sm_120), which
# the PyTorch source build in this file requests. The -devel flavour is needed
# because that compilation happens inside this image.
FROM nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04

# Build-time only: keeps apt non-interactive during `docker build` without
# leaking the setting into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# PYTHONUNBUFFERED: flush Python stdout/stderr immediately so output is not held in buffers.
# HF_HOME: HuggingFace model cache — mounted as a volume so models persist across runs.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/cache/huggingface
# OS-level toolchain and runtime packages, one per line and sorted so that
# additions/removals produce minimal diffs.
# --no-install-recommends keeps optional extras out of the image; because of
# that, ca-certificates is listed explicitly for the HTTPS downloads performed
# by curl, git and pip later in the build.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        ffmpeg \
        git \
        ninja-build \
        python3.11 \
        python3.11-dev \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/*
# Bootstrap pip for python3.11 (Debian disables ensurepip for system Python).
# The script is saved to disk before running it: with `curl ... | python3.11`
# and no pipefail, a failed download hands python an empty stdin, python exits
# 0, and the layer "succeeds" without pip being installed.
# -f makes curl fail on HTTP errors, -L follows redirects.
RUN curl -fsSL -o /tmp/get-pip.py https://bootstrap.pypa.io/get-pip.py && \
    python3.11 /tmp/get-pip.py --no-cache-dir && \
    rm -f /tmp/get-pip.py

# Make plain `python` resolve to the 3.11 interpreter.
RUN ln -sf /usr/bin/python3.11 /usr/bin/python

# All following relative paths (COPY targets, CMD) resolve inside /app.
WORKDIR /app
# Build PyTorch from source with Blackwell (sm_120) support.
#
# PYTORCH_REF selects the branch or tag to build; override it with
# `--build-arg PYTORCH_REF=<tag>` to get a reproducible image. The default
# keeps the previous behaviour of building the tip of main.
#
# NOTE: the "12.0" entry in TORCH_CUDA_ARCH_LIST needs an nvcc that knows
# sm_120, i.e. the base image must ship CUDA 12.8 or newer.
ARG PYTORCH_REF=main

# Steps, all in one layer so the multi-GB checkout never persists in the image:
#   1. shallow-clone the requested ref and fetch its submodules
#   2. install PyTorch's own build requirements (recent cmake, ninja, pyyaml,
#      typing-extensions, ...) — the build cannot start without them
#   3. build and install via pip rather than the deprecated `setup.py install`;
#      --no-build-isolation makes pip use the requirements installed in step 2
#   4. delete the checkout
RUN git clone --depth 1 --branch "${PYTORCH_REF}" \
        https://github.com/pytorch/pytorch.git /tmp/pytorch && \
    cd /tmp/pytorch && \
    git submodule update --init --recursive && \
    python3.11 -m pip install --no-cache-dir -r requirements.txt && \
    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
        python3.11 -m pip install --no-cache-dir --no-build-isolation -v . && \
    cd / && rm -rf /tmp/pytorch
# Build torchvision and torchaudio from source against the torch that is
# already installed in the image.
#
# The prebuilt cu121 wheels cannot be used here: each one pins an exact torch
# release, so `pip install torchvision torchaudio` would uninstall the
# source-built torch (and its sm_120 kernels) and pull in a stock cu121 wheel.
#
#   --no-deps             pip must never touch the installed torch
#   --no-build-isolation  compile against that torch, not a fresh download
#   FORCE_CUDA / USE_CUDA build the CUDA ops although no GPU is visible
#                         during `docker build`
#
# The *_REF args select the branch or tag; keep them in step with the torch
# revision that was built.
ARG TORCHVISION_REF=main
ARG TORCHAUDIO_REF=main
RUN FORCE_CUDA=1 \
    USE_CUDA=1 \
    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
    python3.11 -m pip install --no-cache-dir --no-build-isolation --no-deps -v \
        "git+https://github.com/pytorch/vision.git@${TORCHVISION_REF}" \
        "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_REF}" && \
    # --no-deps skipped torchvision's runtime dependencies; pillow is the one
    # that requirements.txt does not already provide (numpy is listed there).
    python3.11 -m pip install --no-cache-dir pillow
# auto-gptq: take the pre-built CUDA 12.1 wheel from the AutoGPTQ wheel index
# (added as an extra index next to PyPI) so nothing is compiled from source.
# NOTE(review): the wheel's CUDA extension was built against a stock torch
# release — confirm it loads with the torch installed in this image.
RUN python3.11 -m pip install \
        --no-cache-dir \
        --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/ \
        "auto-gptq>=0.7.1"
# Install the rest of the app requirements.
# requirements.txt is copied on its own so this layer stays cached until the
# dependency list itself changes, not on every source edit.
COPY requirements.txt .
RUN python3.11 -m pip install --no-cache-dir -r requirements.txt

# Run as an unprivileged user instead of root.
#  - fixed numeric UID/GID so orchestrators can verify the non-root identity
#  - a home directory is created for libraries that write per-user files
#  - /cache/huggingface (HF_HOME) is created and chowned here so that a named
#    volume mounted on it is initialised with ownership the app can write to
#  - /app is handed over so the process can write next to its sources
RUN groupadd --system --gid 10001 app && \
    useradd --system --uid 10001 --gid app --create-home app && \
    mkdir -p /cache/huggingface && \
    chown app:app /app /cache/huggingface

# Application sources, owned by the runtime user.
COPY --chown=app:app . .

USER app

# Documentation only: the server listens on 8000 inside the container.
EXPOSE 8000

# Exec form, so the Python process is PID 1 and receives SIGTERM directly.
CMD ["python3.11", "run.py"]
+18
View File
@@ -0,0 +1,18 @@
services:
  voice-chat:
    # Image is built from the Dockerfile in this directory.
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Named volume holding the HuggingFace cache, so downloaded models
      # survive container rebuilds.
      - huggingface-cache:/cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for the service.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu

volumes:
  huggingface-cache:
+6 -1
View File
@@ -1,6 +1,11 @@
torch>=2.5.0 # torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs.
# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
transformers==4.57.6 transformers==4.57.6
optimum>=1.19
compressed-tensors>=0.5.0
silero-vad>=5.1 silero-vad>=5.1
qwen-asr==0.0.6
kokoro==0.9.4
fastapi>=0.115.0 fastapi>=0.115.0
uvicorn[standard]>=0.30.0 uvicorn[standard]>=0.30.0
numpy numpy
+22 -6
View File
@@ -9,6 +9,20 @@ from server.tts import TTSEngine
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def get_device():
    """Choose the device string used when loading models.

    Returns:
        "cuda:0" when torch reports CUDA as available and a one-element tensor
        can actually be allocated on that device; "cpu" in every other case.
    """
    if not torch.cuda.is_available():
        log.info("Using CPU device")
        return "cpu"

    try:
        # Probe allocation: is_available() alone does not prove the device works.
        torch.zeros(1, device="cuda:0")
    except RuntimeError as e:
        log.warning(f"CUDA available but error occurred: {e}. Falling back to CPU.")
        log.info("Using CPU device")
        return "cpu"

    log.info("Using CUDA device")
    return "cuda:0"
class ModelManager: class ModelManager:
"""Loads and holds all models. Initialized once at server startup.""" """Loads and holds all models. Initialized once at server startup."""
@@ -37,28 +51,30 @@ class ModelManager:
log.info("Loading Qwen3-ASR-0.6B (transformers backend)...") log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
from qwen_asr import Qwen3ASRModel from qwen_asr import Qwen3ASRModel
device = get_device()
asr_model = Qwen3ASRModel.from_pretrained( asr_model = Qwen3ASRModel.from_pretrained(
"Qwen/Qwen3-ASR-0.6B", "Qwen/Qwen3-ASR-0.6B",
dtype=torch.bfloat16, dtype=torch.bfloat16,
device_map="cuda:0", device_map=device,
max_new_tokens=4096, max_new_tokens=4096,
) )
self.asr_engine = ASREngine(asr_model) self.asr_engine = ASREngine(asr_model)
log.info("Qwen3-ASR-0.6B loaded.") log.info("Qwen3-ASR-0.6B loaded.")
def _load_llm(self): def _load_llm(self):
log.info("Loading Qwen3-0.6B-Instruct...") log.info("Loading Qwen3-4B (GPTQ 4-bit)...")
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "Qwen/Qwen3-0.6B" model_name = "Qwen/Qwen3.5-0.8B"
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
device = get_device()
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
torch_dtype=torch.bfloat16, device_map=device,
device_map="cuda:0",
) )
self.llm_engine = LLMEngine(model, tokenizer) self.llm_engine = LLMEngine(model, tokenizer)
log.info("Qwen3-0.6B-Instruct loaded.") log.info("Qwen3-4B-GPTQ-Int4 loaded (~2.5GB VRAM).")
def _load_tts(self): def _load_tts(self):
log.info("Loading Kokoro TTS...") log.info("Loading Kokoro TTS...")