Enhance video handling and performance optimizations
- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
This commit is contained in:
+49
-22
@@ -238,36 +238,63 @@ class ConversationSession:
|
||||
|
||||
tts_thread.join(timeout=2.0)
|
||||
|
||||
# Video mode: render the speaking clip now that TTS is done.
|
||||
# Video mode: stream speaking clips as they're generated (one per audio segment).
|
||||
if use_video and audio_buffer and not self.cancel_event.is_set():
|
||||
try:
|
||||
full_audio = np.concatenate(audio_buffer).astype(np.float32)
|
||||
sample_rate = getattr(self.models.tts_engine, "sample_rate", 24000)
|
||||
log.info(
|
||||
"Video: rendering speaking clip (audio=%ds, mode=%s)",
|
||||
int(len(full_audio) / sample_rate), video_engine.cfg.mode,
|
||||
"Video: rendering speaking clips (audio=%.1fs, mode=%s)",
|
||||
len(full_audio) / sample_rate, video_engine.cfg.mode,
|
||||
)
|
||||
mp4_bytes = await asyncio.to_thread(
|
||||
video_engine.generate_speaking_clip,
|
||||
full_audio,
|
||||
sample_rate,
|
||||
response,
|
||||
)
|
||||
if self.cancel_event.is_set():
|
||||
log.info("Video clip discarded (cancelled during render).")
|
||||
else:
|
||||
duration_ms = int(len(full_audio) / sample_rate * 1000)
|
||||
await self.send_json({
|
||||
"type": "speaking_clip",
|
||||
"chunk_id": 0,
|
||||
"duration_ms": duration_ms,
|
||||
"text": response,
|
||||
"size_bytes": len(mp4_bytes),
|
||||
})
|
||||
await self.send_bytes(mp4_bytes)
|
||||
|
||||
clip_queue: queue.Queue = queue.Queue()
|
||||
|
||||
def _video_worker():
|
||||
try:
|
||||
for clip_data in video_engine.generate_speaking_clips_streaming(
|
||||
full_audio, sample_rate, response
|
||||
):
|
||||
if self.cancel_event.is_set():
|
||||
break
|
||||
clip_queue.put(clip_data)
|
||||
except Exception:
|
||||
log.exception("Video clip generation failed")
|
||||
finally:
|
||||
clip_queue.put(_SENTINEL)
|
||||
|
||||
video_thread = threading.Thread(target=_video_worker, daemon=True)
|
||||
video_thread.start()
|
||||
|
||||
is_first_clip = True
|
||||
while not self.cancel_event.is_set():
|
||||
try:
|
||||
item = await asyncio.to_thread(clip_queue.get, timeout=120.0)
|
||||
except Exception:
|
||||
log.warning("Timed out waiting for video clip.")
|
||||
break
|
||||
if item is _SENTINEL:
|
||||
break
|
||||
if self.cancel_event.is_set():
|
||||
break
|
||||
mp4_bytes, duration_ms = item
|
||||
try:
|
||||
await self.send_json({
|
||||
"type": "speaking_clip",
|
||||
"chunk_id": 0,
|
||||
"duration_ms": duration_ms,
|
||||
"text": response if is_first_clip else "",
|
||||
"size_bytes": len(mp4_bytes),
|
||||
})
|
||||
await self.send_bytes(mp4_bytes)
|
||||
is_first_clip = False
|
||||
except Exception:
|
||||
log.warning("Failed to send video clip, client disconnected.")
|
||||
self.cancel_event.set()
|
||||
break
|
||||
|
||||
except Exception:
|
||||
log.exception("Video speaking-clip render failed; falling back silently.")
|
||||
# Best-effort: tell the client nothing was spoken visually.
|
||||
try:
|
||||
await self.send_json({
|
||||
"type": "response_text",
|
||||
|
||||
Reference in New Issue
Block a user