first stab at adding video
This commit is contained in:
+73
-14
@@ -157,11 +157,20 @@ class ConversationSession:
|
||||
|
||||
# TTS - stream chunks with per-sentence text
|
||||
await self.send_json({"type": "status", "state": "speaking"})
|
||||
|
||||
# Video-mode branch: if a video engine is loaded AND an avatar is
|
||||
# set, buffer the full TTS output into a single blob, run MuseTalk
|
||||
# lip-sync (library or reflective source), mux to MP4, and send the
|
||||
# full clip + text in one shot. The client plays the MP4 (which
|
||||
# carries audio) instead of the per-chunk PCM path.
|
||||
video_engine = getattr(self.models, "video_engine", None)
|
||||
use_video = video_engine is not None and video_engine.is_ready()
|
||||
|
||||
chunk_queue = queue.Queue()
|
||||
self._last_played_chunk_id = None
|
||||
|
||||
segments = _split_into_segments(response)
|
||||
log.info(f"TTS: split response into {len(segments)} segments")
|
||||
log.info(f"TTS: split response into {len(segments)} segments (video={use_video})")
|
||||
|
||||
def _tts_worker():
|
||||
try:
|
||||
@@ -187,6 +196,10 @@ class ConversationSession:
|
||||
chunk_id = 0
|
||||
# Maps chunk_id -> cumulative text up to and including that chunk
|
||||
chunk_text_map: dict[int, str] = {}
|
||||
# Video mode accumulator: we buffer all TTS audio into one float32
|
||||
# array so MuseTalk can align against the full utterance.
|
||||
audio_buffer: list[np.ndarray] = []
|
||||
|
||||
while True:
|
||||
try:
|
||||
item = await asyncio.to_thread(chunk_queue.get, timeout=10.0)
|
||||
@@ -202,23 +215,69 @@ class ConversationSession:
|
||||
spoken_text += sentence_text
|
||||
chunk_text_map[chunk_id] = spoken_text
|
||||
|
||||
await self.send_json({
|
||||
"type": "response_text",
|
||||
"text": sentence_text,
|
||||
"chunk_id": chunk_id,
|
||||
"final": False,
|
||||
})
|
||||
pcm_bytes = float32_to_pcm_bytes(audio)
|
||||
try:
|
||||
await self.send_bytes(pcm_bytes)
|
||||
except Exception:
|
||||
log.warning("Failed to send audio, client disconnected.")
|
||||
self.cancel_event.set()
|
||||
break
|
||||
if use_video:
|
||||
audio_buffer.append(audio)
|
||||
# Don't stream text or PCM during video mode — we'll send
|
||||
# everything after the clip renders so the client doesn't
|
||||
# start displaying text before the video is ready.
|
||||
else:
|
||||
await self.send_json({
|
||||
"type": "response_text",
|
||||
"text": sentence_text,
|
||||
"chunk_id": chunk_id,
|
||||
"final": False,
|
||||
})
|
||||
pcm_bytes = float32_to_pcm_bytes(audio)
|
||||
try:
|
||||
await self.send_bytes(pcm_bytes)
|
||||
except Exception:
|
||||
log.warning("Failed to send audio, client disconnected.")
|
||||
self.cancel_event.set()
|
||||
break
|
||||
chunk_id += 1
|
||||
|
||||
tts_thread.join(timeout=2.0)
|
||||
|
||||
# Video mode: render the speaking clip now that TTS is done.
|
||||
if use_video and audio_buffer and not self.cancel_event.is_set():
|
||||
try:
|
||||
full_audio = np.concatenate(audio_buffer).astype(np.float32)
|
||||
sample_rate = getattr(self.models.tts_engine, "sample_rate", 24000)
|
||||
log.info(
|
||||
"Video: rendering speaking clip (audio=%ds, mode=%s)",
|
||||
int(len(full_audio) / sample_rate), video_engine.cfg.mode,
|
||||
)
|
||||
mp4_bytes = await asyncio.to_thread(
|
||||
video_engine.generate_speaking_clip,
|
||||
full_audio,
|
||||
sample_rate,
|
||||
response,
|
||||
)
|
||||
if self.cancel_event.is_set():
|
||||
log.info("Video clip discarded (cancelled during render).")
|
||||
else:
|
||||
duration_ms = int(len(full_audio) / sample_rate * 1000)
|
||||
await self.send_json({
|
||||
"type": "speaking_clip",
|
||||
"chunk_id": 0,
|
||||
"duration_ms": duration_ms,
|
||||
"text": response,
|
||||
"size_bytes": len(mp4_bytes),
|
||||
})
|
||||
await self.send_bytes(mp4_bytes)
|
||||
except Exception:
|
||||
log.exception("Video speaking-clip render failed; falling back silently.")
|
||||
# Best-effort: no clip was produced, so still deliver the full response text (final=True) for the client to display.
|
||||
try:
|
||||
await self.send_json({
|
||||
"type": "response_text",
|
||||
"text": response,
|
||||
"chunk_id": 0,
|
||||
"final": True,
|
||||
})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Determine what was actually heard by the client
|
||||
was_interrupted = spoken_text.strip() != response.strip()
|
||||
if was_interrupted and self._last_played_chunk_id is not None:
|
||||
|
||||
Reference in New Issue
Block a user