first stab at adding video

2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
@@ -157,11 +157,20 @@ class ConversationSession:

        # TTS - stream chunks with per-sentence text
        await self.send_json({"type": "status", "state": "speaking"})
+
+        # Video-mode branch: if a video engine is loaded AND an avatar is
+        # set, buffer the full TTS output into a single blob, run MuseTalk
+        # lip-sync (library or reflective source), mux to MP4, and send the
+        # full clip + text in one shot. The client plays the MP4 (which
+        # carries audio) instead of the per-chunk PCM path.
+        video_engine = getattr(self.models, "video_engine", None)
+        use_video = video_engine is not None and video_engine.is_ready()
+
        chunk_queue = queue.Queue()
        self._last_played_chunk_id = None

        segments = _split_into_segments(response)
-        log.info(f"TTS: split response into {len(segments)} segments")
+        log.info(f"TTS: split response into {len(segments)} segments (video={use_video})")

        def _tts_worker():
            try:
@@ -187,6 +196,10 @@ class ConversationSession:
        chunk_id = 0
        # Maps chunk_id -> cumulative text up to and including that chunk
        chunk_text_map: dict[int, str] = {}
+        # Video mode accumulator: we buffer all TTS audio into one float32
+        # array so MuseTalk can align against the full utterance.
+        audio_buffer: list[np.ndarray] = []
+
        while True:
            try:
                item = await asyncio.to_thread(chunk_queue.get, timeout=10.0)
@@ -202,23 +215,69 @@ class ConversationSession:
            spoken_text += sentence_text
            chunk_text_map[chunk_id] = spoken_text

-            await self.send_json({
-                "type": "response_text",
-                "text": sentence_text,
-                "chunk_id": chunk_id,
-                "final": False,
-            })
-            pcm_bytes = float32_to_pcm_bytes(audio)
-            try:
-                await self.send_bytes(pcm_bytes)
-            except Exception:
-                log.warning("Failed to send audio, client disconnected.")
-                self.cancel_event.set()
-                break
+            if use_video:
+                audio_buffer.append(audio)
+                # Don't stream text or PCM during video mode — we'll send
+                # everything after the clip renders so the client doesn't
+                # start displaying text before the video is ready.
+            else:
+                await self.send_json({
+                    "type": "response_text",
+                    "text": sentence_text,
+                    "chunk_id": chunk_id,
+                    "final": False,
+                })
+                pcm_bytes = float32_to_pcm_bytes(audio)
+                try:
+                    await self.send_bytes(pcm_bytes)
+                except Exception:
+                    log.warning("Failed to send audio, client disconnected.")
+                    self.cancel_event.set()
+                    break
            chunk_id += 1

        tts_thread.join(timeout=2.0)

+        # Video mode: render the speaking clip now that TTS is done.
+        if use_video and audio_buffer and not self.cancel_event.is_set():
+            try:
+                full_audio = np.concatenate(audio_buffer).astype(np.float32)
+                sample_rate = getattr(self.models.tts_engine, "sample_rate", 24000)
+                log.info(
+                    "Video: rendering speaking clip (audio=%ds, mode=%s)",
+                    int(len(full_audio) / sample_rate), video_engine.cfg.mode,
+                )
+                mp4_bytes = await asyncio.to_thread(
+                    video_engine.generate_speaking_clip,
+                    full_audio,
+                    sample_rate,
+                    response,
+                )
+                if self.cancel_event.is_set():
+                    log.info("Video clip discarded (cancelled during render).")
+                else:
+                    duration_ms = int(len(full_audio) / sample_rate * 1000)
+                    await self.send_json({
+                        "type": "speaking_clip",
+                        "chunk_id": 0,
+                        "duration_ms": duration_ms,
+                        "text": response,
+                        "size_bytes": len(mp4_bytes),
+                    })
+                    await self.send_bytes(mp4_bytes)
+            except Exception:
+                log.exception("Video speaking-clip render failed; falling back silently.")
+                # Best-effort: no clip was sent, so send the full response as one final text message.
+                try:
+                    await self.send_json({
+                        "type": "response_text",
+                        "text": response,
+                        "chunk_id": 0,
+                        "final": True,
+                    })
+                except Exception:
+                    pass
+
        # Determine what was actually heard by the client
        was_interrupted = spoken_text.strip() != response.strip()
        if was_interrupted and self._last_played_chunk_id is not None: