first stab at adding video

This commit is contained in:
2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
+73 -14
View File
@@ -157,11 +157,20 @@ class ConversationSession:
# TTS - stream chunks with per-sentence text
await self.send_json({"type": "status", "state": "speaking"})
# Video-mode branch: if a video engine is loaded AND an avatar is
# set, buffer the full TTS output into a single blob, run MuseTalk
# lip-sync (library or reflective source), mux to MP4, and send the
# full clip + text in one shot. The client plays the MP4 (which
# carries audio) instead of the per-chunk PCM path.
video_engine = getattr(self.models, "video_engine", None)
use_video = video_engine is not None and video_engine.is_ready()
chunk_queue = queue.Queue()
self._last_played_chunk_id = None
segments = _split_into_segments(response)
log.info(f"TTS: split response into {len(segments)} segments")
log.info(f"TTS: split response into {len(segments)} segments (video={use_video})")
def _tts_worker():
try:
@@ -187,6 +196,10 @@ class ConversationSession:
chunk_id = 0
# Maps chunk_id -> cumulative text up to and including that chunk
chunk_text_map: dict[int, str] = {}
# Video mode accumulator: collect each TTS chunk here; the chunks are
# concatenated into one float32 array once synthesis finishes so MuseTalk
# can align against the full utterance.
audio_buffer: list[np.ndarray] = []
while True:
try:
item = await asyncio.to_thread(chunk_queue.get, timeout=10.0)
@@ -202,23 +215,69 @@ class ConversationSession:
spoken_text += sentence_text
chunk_text_map[chunk_id] = spoken_text
await self.send_json({
"type": "response_text",
"text": sentence_text,
"chunk_id": chunk_id,
"final": False,
})
pcm_bytes = float32_to_pcm_bytes(audio)
try:
await self.send_bytes(pcm_bytes)
except Exception:
log.warning("Failed to send audio, client disconnected.")
self.cancel_event.set()
break
if use_video:
audio_buffer.append(audio)
# Don't stream text or PCM during video mode — we'll send
# everything after the clip renders so the client doesn't
# start displaying text before the video is ready.
else:
await self.send_json({
"type": "response_text",
"text": sentence_text,
"chunk_id": chunk_id,
"final": False,
})
pcm_bytes = float32_to_pcm_bytes(audio)
try:
await self.send_bytes(pcm_bytes)
except Exception:
log.warning("Failed to send audio, client disconnected.")
self.cancel_event.set()
break
chunk_id += 1
tts_thread.join(timeout=2.0)
# Video mode: render the speaking clip now that TTS is done.
if use_video and audio_buffer and not self.cancel_event.is_set():
try:
full_audio = np.concatenate(audio_buffer).astype(np.float32)
sample_rate = getattr(self.models.tts_engine, "sample_rate", 24000)
log.info(
"Video: rendering speaking clip (audio=%ds, mode=%s)",
int(len(full_audio) / sample_rate), video_engine.cfg.mode,
)
mp4_bytes = await asyncio.to_thread(
video_engine.generate_speaking_clip,
full_audio,
sample_rate,
response,
)
if self.cancel_event.is_set():
log.info("Video clip discarded (cancelled during render).")
else:
duration_ms = int(len(full_audio) / sample_rate * 1000)
await self.send_json({
"type": "speaking_clip",
"chunk_id": 0,
"duration_ms": duration_ms,
"text": response,
"size_bytes": len(mp4_bytes),
})
await self.send_bytes(mp4_bytes)
except Exception:
log.exception("Video speaking-clip render failed; falling back silently.")
# Best-effort fallback: send the full response as a final text message so
# the client still shows the reply even though no clip was rendered.
try:
await self.send_json({
"type": "response_text",
"text": response,
"chunk_id": 0,
"final": True,
})
except Exception:
pass
# Determine what was actually heard by the client
was_interrupted = spoken_text.strip() != response.strip()
if was_interrupted and self._last_played_chunk_id is not None: