Enhance video handling and performance optimizations

- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
This commit is contained in:
2026-04-24 00:36:18 -04:00
parent 129df7d1fa
commit 44a10667c2
7 changed files with 234 additions and 69 deletions
+49 -22
View File
@@ -238,36 +238,63 @@ class ConversationSession:
tts_thread.join(timeout=2.0)
# Video mode: render the speaking clip now that TTS is done.
# Video mode: stream speaking clips as they're generated (one per audio segment).
if use_video and audio_buffer and not self.cancel_event.is_set():
try:
full_audio = np.concatenate(audio_buffer).astype(np.float32)
sample_rate = getattr(self.models.tts_engine, "sample_rate", 24000)
log.info(
"Video: rendering speaking clip (audio=%ds, mode=%s)",
int(len(full_audio) / sample_rate), video_engine.cfg.mode,
"Video: rendering speaking clips (audio=%.1fs, mode=%s)",
len(full_audio) / sample_rate, video_engine.cfg.mode,
)
mp4_bytes = await asyncio.to_thread(
video_engine.generate_speaking_clip,
full_audio,
sample_rate,
response,
)
if self.cancel_event.is_set():
log.info("Video clip discarded (cancelled during render).")
else:
duration_ms = int(len(full_audio) / sample_rate * 1000)
await self.send_json({
"type": "speaking_clip",
"chunk_id": 0,
"duration_ms": duration_ms,
"text": response,
"size_bytes": len(mp4_bytes),
})
await self.send_bytes(mp4_bytes)
clip_queue: queue.Queue = queue.Queue()
# Producer side of the speaking-clip stream. It is started on a daemon thread
# right after this definition and feeds `clip_queue`, which the async loop
# below drains. Closes over `self`, `video_engine`, `full_audio`,
# `sample_rate`, `response` and `clip_queue` from the enclosing scope.
def _video_worker():
try:
for clip_data in video_engine.generate_speaking_clips_streaming(
full_audio, sample_rate, response
):
# Stop forwarding clips as soon as the session is cancelled; the clip that
# was just produced is dropped rather than queued.
if self.cancel_event.is_set():
break
# NOTE(review): the consumer unpacks each item as (mp4_bytes, duration_ms);
# presumably that is what the generator yields -- confirm against
# generate_speaking_clips_streaming.
clip_queue.put(clip_data)
except Exception:
# Failures are logged and swallowed here; nothing is re-raised to the
# thread, so the `finally` below is the only signal the consumer receives.
log.exception("Video clip generation failed")
finally:
# Always enqueue the sentinel (normal end, cancellation, or error) so the
# consumer loop sees end-of-stream instead of waiting on its get() timeout.
clip_queue.put(_SENTINEL)
video_thread = threading.Thread(target=_video_worker, daemon=True)
video_thread.start()
is_first_clip = True
while not self.cancel_event.is_set():
try:
item = await asyncio.to_thread(clip_queue.get, timeout=120.0)
except Exception:
log.warning("Timed out waiting for video clip.")
break
if item is _SENTINEL:
break
if self.cancel_event.is_set():
break
mp4_bytes, duration_ms = item
try:
await self.send_json({
"type": "speaking_clip",
"chunk_id": 0,
"duration_ms": duration_ms,
"text": response if is_first_clip else "",
"size_bytes": len(mp4_bytes),
})
await self.send_bytes(mp4_bytes)
is_first_clip = False
except Exception:
log.warning("Failed to send video clip, client disconnected.")
self.cancel_event.set()
break
except Exception:
log.exception("Video speaking-clip render failed; falling back silently.")
# Best-effort: tell the client nothing was spoken visually.
try:
await self.send_json({
"type": "response_text",