Enhance video handling and performance optimizations

- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
2026-04-24 00:36:18 -04:00
parent 129df7d1fa
commit 44a10667c2
7 changed files with 234 additions and 69 deletions
@@ -11,6 +11,7 @@ from __future__ import annotations

 import logging
 import threading
+from collections.abc import Iterator
 from dataclasses import dataclass, field
 from typing import Literal

@@ -287,9 +288,12 @@ class VideoEngine:
        - Library mode: also pre-generate ``library.base_clip_count``
          speaking base clips.
        - Reflective mode: idle loop only.
+
+        Lazily calls load_models() on first invocation so that Wan2.2's VRAM
+        footprint doesn't exist until video is actually used.
        """
        if self._wan22 is None:
-            raise RuntimeError("set_avatar called before load_models()")
+            self.load_models()

        with self._lock:
            log.info("Setting avatar: %s", image_path)
@@ -383,8 +387,11 @@ class VideoEngine:
    def _pick_library_frames(
        self, audio_f32: np.ndarray, sample_rate: int
    ) -> np.ndarray:
-        """Round-robin pick from the pre-baked library, clipped or looped
-        to roughly the audio's duration so there's no long freeze frame."""
+        """Round-robin pick from the pre-baked library, clipped to the segment duration.
+
+        Does not loop frames — callers that need longer coverage should split
+        the audio into segments and call this once per segment.
+        """
        if not self.speaking_base_frames:
            raise RuntimeError(
                "Library mode has no pre-baked base clips. "
@@ -398,12 +405,75 @@ class VideoEngine:
        target_frames = int(round(len(audio_f32) / sample_rate * self.cfg.fps))
        if target_frames <= 0:
            return frames
-        if target_frames <= len(frames):
-            return frames[:target_frames]
-        # Loop (with a mirror tail to soften the seam) to cover longer audio.
-        loops = target_frames // len(frames) + 1
-        extended = np.concatenate([frames] * loops, axis=0)
-        return extended[:target_frames]
+        return frames[:min(target_frames, len(frames))]
+
+    def generate_speaking_clips_streaming(
+        self,
+        audio_f32: np.ndarray,
+        sample_rate: int,
+        reply_text: str,
+    ) -> Iterator[tuple[bytes, int]]:
+        """Generate one MP4 per clip-length audio segment, yielding each when ready.
+
+        Splits ``audio_f32`` into consecutive segments of
+        ``cfg.library_base_clip_seconds`` (library mode) or
+        ``cfg.reflective_clip_seconds`` (any other mode); the final segment
+        may be shorter. For each segment, base frames are picked from the
+        pre-baked library or generated via ``self._wan22.generate_i2v``,
+        lip-synced when ``self._musetalk`` is set, and muxed together with the
+        segment's audio so the caller can stream each clip as soon as it is
+        ready rather than waiting for the full response.
+
+        This is a generator: nothing below (including the readiness and
+        clip-length checks) runs until the caller starts iterating.
+
+        Args:
+            audio_f32: Audio samples, sliced along the first axis.
+                NOTE(review): presumably mono float32, per the name; confirm
+                against callers.
+            sample_rate: Sample rate of ``audio_f32`` in Hz.
+            reply_text: Reply text used to derive the reflective-mode prompt.
+
+        Yields:
+            ``(mp4_bytes, duration_ms)`` per segment, where ``duration_ms`` is
+            the segment's audio duration truncated to whole milliseconds.
+
+        Raises:
+            RuntimeError: If the engine is not ready.
+            ValueError: If the configured clip length covers less than one
+                audio sample, so the audio cannot be segmented.
+        """
+        if not self.is_ready():
+            raise RuntimeError(
+                "generate_speaking_clips_streaming: engine not ready "
+                "(avatar set? models loaded?)"
+            )
+        assert self._wan22 is not None
+
+        # Empty audio produces no clips; checked before the clip-length guard
+        # so an empty request never raises.
+        if len(audio_f32) == 0:
+            return
+
+        # Read the mode once so clip length and frame source stay consistent
+        # for every segment of this request.
+        library_mode = self.cfg.mode == "library"
+        clip_sec = (
+            self.cfg.library_base_clip_seconds
+            if library_mode
+            else self.cfg.reflective_clip_seconds
+        )
+        clip_samples = int(clip_sec * sample_rate)
+        if clip_samples <= 0:
+            # A zero step would make range() raise an opaque ValueError, and a
+            # negative step would silently yield no clips at all.
+            raise ValueError(
+                "generate_speaking_clips_streaming: clip length of "
+                f"{clip_sec!r}s at {sample_rate!r} Hz is shorter than one "
+                "audio sample"
+            )
+
+        # Kept as a function-scope import, but resolved once before the loop
+        # so an import failure surfaces before any expensive generation.
+        from server.video_models.muxer import frames_and_audio_to_mp4
+
+        for start in range(0, len(audio_f32), clip_samples):
+            seg_audio = audio_f32[start : start + clip_samples]
+
+            if library_mode:
+                base_frames = self._pick_library_frames(seg_audio, sample_rate)
+            else:
+                # Derived per segment, as before; not hoisted because
+                # _derive_prompt may not be deterministic.
+                prompt = self._derive_prompt(reply_text)
+                log.info("Reflective prompt (clip segment): %s", prompt[:80])
+                base_frames = self._wan22.generate_i2v(
+                    image_path=self.avatar_path or "",
+                    prompt=prompt,
+                    seconds=self.cfg.reflective_clip_seconds,
+                    seed=None,
+                )
+
+            # Without a lip-sync model the base frames are muxed unchanged.
+            if self._musetalk is not None:
+                synced_frames = self._musetalk.lip_sync(
+                    frames=base_frames,
+                    audio=seg_audio,
+                    sample_rate=sample_rate,
+                    fps=self.cfg.fps,
+                )
+            else:
+                synced_frames = base_frames
+
+            mp4_bytes = frames_and_audio_to_mp4(
+                frames=synced_frames,
+                audio=seg_audio,
+                sample_rate=sample_rate,
+                fps=self.cfg.fps,
+            )
+            duration_ms = int(len(seg_audio) / sample_rate * 1000)
+            yield mp4_bytes, duration_ms

    def _derive_prompt(self, reply_text: str) -> str:
        """Template-based prompt builder for reflective mode.