Enhance video handling and performance optimizations

- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
2026-04-24 00:36:18 -04:00
parent 129df7d1fa
commit 44a10667c2
7 changed files with 234 additions and 69 deletions
@@ -7,6 +7,12 @@ ENV HF_HOME=/cache/huggingface
 # LoRA directory — users drop .safetensors files here and reference them
 # from config.yml::video.loras. Bind-mounted via docker-compose.
 ENV LORA_DIR=/cache/loras
+# Prevent PyTorch/OpenMP/MKL thread pools from spin-waiting when idle.
+# Without this, loading large models (ASR, LLM, Wan2.2) causes all CPU cores
+# to busy-loop even with no connected clients, slowing the whole system.
+ENV OMP_WAIT_POLICY=PASSIVE
+ENV MKL_WAIT_POLICY=PASSIVE
+ENV TOKENIZERS_PARALLELISM=false

 RUN apt-get update && apt-get install -y \
    python3.11 \
@@ -50,8 +56,11 @@ RUN python3.11 -m spacy download en_core_web_sm
 # LightX2V (Wan2.2-Lightning inference framework) — installed from source
 # since there is no stable PyPI release yet.
 RUN python3.11 -m pip install --no-cache-dir \
-    "git+https://github.com/ModelTC/LightX2V.git" || \
+    "git+https://github.com/ModelTC/LightX2V.git@6db002f2755036b02bd0900bf9b41958bbfb4137" || \
    echo "LightX2V install failed — config.video.enabled must stay false until fixed"
+# ^ Pinned to 2026-04-14: last commit before WorldMirrorRunner was added to
+# pipeline.py (which requires flash_attn + matplotlib) and before the
+# dummy_model NameError regression in vae_2_2.py.
 #
 # sgl-kernel (fp8 T5 encoder acceleration). The PyPI wheel lacks SM120
 # (Blackwell) CUTLASS kernels; use SGLang's cu128 wheel index instead.