Enhance video handling and performance optimizations

- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
2026-04-24 00:36:18 -04:00
parent 129df7d1fa
commit 44a10667c2
7 changed files with 234 additions and 69 deletions
@@ -238,36 +238,63 @@ class ConversationSession:

        tts_thread.join(timeout=2.0)

-        # Video mode: render the speaking clip now that TTS is done.
+        # Video mode: stream speaking clips as they're generated (one per audio segment).
        if use_video and audio_buffer and not self.cancel_event.is_set():
            try:
                full_audio = np.concatenate(audio_buffer).astype(np.float32)
                sample_rate = getattr(self.models.tts_engine, "sample_rate", 24000)
                log.info(
-                    "Video: rendering speaking clip (audio=%ds, mode=%s)",
-                    int(len(full_audio) / sample_rate), video_engine.cfg.mode,
+                    "Video: rendering speaking clips (audio=%.1fs, mode=%s)",
+                    len(full_audio) / sample_rate, video_engine.cfg.mode,
                )
-                mp4_bytes = await asyncio.to_thread(
-                    video_engine.generate_speaking_clip,
-                    full_audio,
-                    sample_rate,
-                    response,
-                )
-                if self.cancel_event.is_set():
-                    log.info("Video clip discarded (cancelled during render).")
-                else:
-                    duration_ms = int(len(full_audio) / sample_rate * 1000)
-                    await self.send_json({
-                        "type": "speaking_clip",
-                        "chunk_id": 0,
-                        "duration_ms": duration_ms,
-                        "text": response,
-                        "size_bytes": len(mp4_bytes),
-                    })
-                    await self.send_bytes(mp4_bytes)
+
+                clip_queue: queue.Queue = queue.Queue()
+
+                def _video_worker():
+                    try:
+                        for clip_data in video_engine.generate_speaking_clips_streaming(
+                            full_audio, sample_rate, response
+                        ):
+                            if self.cancel_event.is_set():
+                                break
+                            clip_queue.put(clip_data)
+                    except Exception:
+                        log.exception("Video clip generation failed")
+                    finally:
+                        clip_queue.put(_SENTINEL)
+
+                video_thread = threading.Thread(target=_video_worker, daemon=True)
+                video_thread.start()
+
+                is_first_clip = True
+                while not self.cancel_event.is_set():
+                    try:
+                        item = await asyncio.to_thread(clip_queue.get, timeout=120.0)
+                    except Exception:
+                        log.warning("Timed out waiting for video clip.")
+                        break
+                    if item is _SENTINEL:
+                        break
+                    if self.cancel_event.is_set():
+                        break
+                    mp4_bytes, duration_ms = item
+                    try:
+                        await self.send_json({
+                            "type": "speaking_clip",
+                            "chunk_id": 0,
+                            "duration_ms": duration_ms,
+                            "text": response if is_first_clip else "",
+                            "size_bytes": len(mp4_bytes),
+                        })
+                        await self.send_bytes(mp4_bytes)
+                        is_first_clip = False
+                    except Exception:
+                        log.warning("Failed to send video clip, client disconnected.")
+                        self.cancel_event.set()
+                        break
+
            except Exception:
                log.exception("Video speaking-clip render failed; falling back silently.")
-                # Best-effort: tell the client nothing was spoken visually.
                try:
                    await self.send_json({
                        "type": "response_text",