"""ffmpeg-based frame + audio → MP4 muxing. Uses the system ``ffmpeg`` binary already installed in the Dockerfile. No extra python dependencies beyond ``numpy``. """ from __future__ import annotations import logging import os import shutil import subprocess import tempfile import numpy as np log = logging.getLogger(__name__) def _ffmpeg_bin() -> str: bin_path = shutil.which("ffmpeg") if bin_path is None: raise RuntimeError( "ffmpeg binary not found on PATH. It should be installed by " "the Dockerfile (line 13). Ensure you're running inside the " "docker image or install ffmpeg locally." ) return bin_path def _write_raw_frames(frames: np.ndarray, path: str) -> tuple[int, int]: """Write uint8 RGB frames to ``path`` as raw rgb24 bytes. Returns (h, w).""" if frames.ndim != 4 or frames.shape[-1] != 3: raise ValueError( f"frames must be [T, H, W, 3] uint8, got {frames.shape}" ) if frames.dtype != np.uint8: frames = frames.astype(np.uint8) with open(path, "wb") as f: f.write(frames.tobytes()) _, h, w, _ = frames.shape return h, w def _write_wav(audio: np.ndarray, sample_rate: int, path: str) -> None: """Write a float32 mono audio array to a 16-bit PCM WAV at ``path``.""" from scipy.io import wavfile # type: ignore[import-not-found] audio = np.asarray(audio, dtype=np.float32).reshape(-1) int16 = np.clip(audio * 32767.0, -32768, 32767).astype(np.int16) wavfile.write(path, sample_rate, int16) def frames_to_mp4_loop(frames: np.ndarray, fps: int) -> bytes: """Encode ``frames`` to a silent MP4 suitable for looping playback. Used for the idle clip: no audio track, loopable on an HTMLMediaElement without audible seams. """ if frames.size == 0: raise ValueError("frames_to_mp4_loop: empty frames") ffmpeg = _ffmpeg_bin() with tempfile.TemporaryDirectory() as td: raw_path = os.path.join(td, "frames.raw") out_path = os.path.join(td, "out.mp4") h, w = _write_raw_frames(frames, raw_path) cmd = [ ffmpeg, "-y", "-f", "rawvideo", "-pix_fmt", "rgb24", "-s", f"{w}x{h}", "-r", str(fps), "-i", raw_path, "-an", "-c:v", "libx264", "-preset", "veryfast", "-pix_fmt", "yuv420p", "-movflags", "+faststart", out_path, ] log.debug("muxer idle clip: %s", " ".join(cmd)) _run_ffmpeg(cmd) with open(out_path, "rb") as f: return f.read() def frames_and_audio_to_mp4( frames: np.ndarray, audio: np.ndarray, sample_rate: int, fps: int, ) -> bytes: """Encode ``frames`` + ``audio`` to an MP4 with H.264 video + AAC audio. Used for per-turn speaking clips. """ if frames.size == 0: raise ValueError("frames_and_audio_to_mp4: empty frames") if audio.size == 0: raise ValueError("frames_and_audio_to_mp4: empty audio") ffmpeg = _ffmpeg_bin() with tempfile.TemporaryDirectory() as td: raw_path = os.path.join(td, "frames.raw") wav_path = os.path.join(td, "audio.wav") out_path = os.path.join(td, "out.mp4") h, w = _write_raw_frames(frames, raw_path) _write_wav(audio, sample_rate, wav_path) cmd = [ ffmpeg, "-y", "-f", "rawvideo", "-pix_fmt", "rgb24", "-s", f"{w}x{h}", "-r", str(fps), "-i", raw_path, "-i", wav_path, "-c:v", "libx264", "-preset", "veryfast", "-pix_fmt", "yuv420p", "-c:a", "aac", "-b:a", "128k", "-shortest", "-movflags", "+faststart", out_path, ] log.debug("muxer speaking clip: %s", " ".join(cmd)) _run_ffmpeg(cmd) with open(out_path, "rb") as f: return f.read() def _run_ffmpeg(cmd: list[str]) -> None: try: proc = subprocess.run( cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) except subprocess.CalledProcessError as e: log.error("ffmpeg failed (exit %d): %s", e.returncode, e.stderr.decode(errors="replace")) raise if proc.returncode != 0: # pragma: no cover raise RuntimeError(f"ffmpeg returned {proc.returncode}")