Enhance video handling and performance optimizations
- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce the VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for speaking clips so that multiple requests are handled smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
This commit is contained in:
+79
-9
@@ -11,6 +11,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Literal
|
||||
|
||||
@@ -287,9 +288,12 @@ class VideoEngine:
|
||||
- Library mode: also pre-generate ``library.base_clip_count``
|
||||
speaking base clips.
|
||||
- Reflective mode: idle loop only.
|
||||
|
||||
Lazily calls load_models() on first invocation so that Wan2.2's VRAM
|
||||
footprint doesn't exist until video is actually used.
|
||||
"""
|
||||
if self._wan22 is None:
|
||||
raise RuntimeError("set_avatar called before load_models()")
|
||||
self.load_models()
|
||||
|
||||
with self._lock:
|
||||
log.info("Setting avatar: %s", image_path)
|
||||
@@ -383,8 +387,11 @@ class VideoEngine:
|
||||
def _pick_library_frames(
|
||||
self, audio_f32: np.ndarray, sample_rate: int
|
||||
) -> np.ndarray:
|
||||
"""Round-robin pick from the pre-baked library, clipped or looped
|
||||
to roughly the audio's duration so there's no long freeze frame."""
|
||||
"""Round-robin pick from the pre-baked library, clipped to the segment duration.
|
||||
|
||||
Does not loop frames — callers that need longer coverage should split
|
||||
the audio into segments and call this once per segment.
|
||||
"""
|
||||
if not self.speaking_base_frames:
|
||||
raise RuntimeError(
|
||||
"Library mode has no pre-baked base clips. "
|
||||
@@ -398,12 +405,75 @@ class VideoEngine:
|
||||
target_frames = int(round(len(audio_f32) / sample_rate * self.cfg.fps))
|
||||
if target_frames <= 0:
|
||||
return frames
|
||||
if target_frames <= len(frames):
|
||||
return frames[:target_frames]
|
||||
# Loop (with a mirror tail to soften the seam) to cover longer audio.
|
||||
loops = target_frames // len(frames) + 1
|
||||
extended = np.concatenate([frames] * loops, axis=0)
|
||||
return extended[:target_frames]
|
||||
return frames[:min(target_frames, len(frames))]
|
||||
|
||||
def generate_speaking_clips_streaming(
    self,
    audio_f32: np.ndarray,
    sample_rate: int,
    reply_text: str,
) -> Iterator[tuple[bytes, int]]:
    """Generate one MP4 per clip-length audio segment, yielding each when ready.

    Splits ``audio_f32`` into consecutive segments of
    ``cfg.library_base_clip_seconds`` (library mode) or
    ``cfg.reflective_clip_seconds`` (any other mode). For each segment the
    base frames are picked from the library or generated from the avatar
    image, lip-synced when ``self._musetalk`` is set, and muxed with the
    segment's audio into an MP4.

    This is a generator, so nothing below runs (and no error is raised)
    until the caller starts iterating.

    Args:
        audio_f32: Audio samples for the whole reply; sliced along its
            first axis.
        sample_rate: Sample rate of ``audio_f32`` in Hz.
        reply_text: Reply text used to derive the prompt in non-library
            mode.

    Yields:
        ``(mp4_bytes, duration_ms)`` per segment, where ``duration_ms`` is
        the segment's sample count converted to whole milliseconds
        (truncated).

    Raises:
        RuntimeError: If ``self.is_ready()`` is false.
        ValueError: If the configured clip length does not cover at least
            one whole sample at ``sample_rate``.
    """
    if not self.is_ready():
        raise RuntimeError(
            "generate_speaking_clips_streaming: engine not ready "
            "(avatar set? models loaded?)"
        )
    # Invariant/type-narrowing check only; readiness is validated above.
    assert self._wan22 is not None

    # ``len`` rather than truthiness: the truth value of a multi-element
    # numpy array is ambiguous.
    if len(audio_f32) == 0:
        return

    library_mode = self.cfg.mode == "library"
    clip_sec = (
        self.cfg.library_base_clip_seconds
        if library_mode
        else self.cfg.reflective_clip_seconds
    )
    clip_samples = int(clip_sec * sample_rate)
    if clip_samples <= 0:
        # A zero step would make range() fail with an unrelated-looking
        # message, and a negative step would silently produce no clips.
        raise ValueError(
            "generate_speaking_clips_streaming: clip length of "
            f"{clip_sec!r}s at {sample_rate!r} Hz yields no samples per clip"
        )

    # Deferred import, resolved once before any frames are generated so an
    # import problem surfaces before the first segment's work is spent.
    from server.video_models.muxer import frames_and_audio_to_mp4

    for start in range(0, len(audio_f32), clip_samples):
        seg_audio = audio_f32[start : start + clip_samples]

        if library_mode:
            base_frames = self._pick_library_frames(seg_audio, sample_rate)
        else:
            prompt = self._derive_prompt(reply_text)
            log.info("Reflective prompt (clip segment): %s", prompt[:80])
            base_frames = self._wan22.generate_i2v(
                image_path=self.avatar_path or "",
                prompt=prompt,
                seconds=self.cfg.reflective_clip_seconds,
                seed=None,
            )

        if self._musetalk is not None:
            synced_frames = self._musetalk.lip_sync(
                frames=base_frames,
                audio=seg_audio,
                sample_rate=sample_rate,
                fps=self.cfg.fps,
            )
        else:
            # No lip-sync model available: mux the base frames unchanged.
            synced_frames = base_frames

        mp4_bytes = frames_and_audio_to_mp4(
            frames=synced_frames,
            audio=seg_audio,
            sample_rate=sample_rate,
            fps=self.cfg.fps,
        )
        duration_ms = int(len(seg_audio) / sample_rate * 1000)
        yield mp4_bytes, duration_ms
|
||||
|
||||
def _derive_prompt(self, reply_text: str) -> str:
|
||||
"""Template-based prompt builder for reflective mode.
|
||||
|
||||
Reference in New Issue
Block a user