first stab at adding video

This commit is contained in:
2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
+146
View File
@@ -0,0 +1,146 @@
"""ffmpeg-based frame + audio → MP4 muxing.
Uses the system ``ffmpeg`` binary already installed in the Dockerfile.
No extra python dependencies beyond ``numpy``.
"""
from __future__ import annotations
import logging
import os
import shutil
import subprocess
import tempfile
import numpy as np
log = logging.getLogger(__name__)
def _ffmpeg_bin() -> str:
    """Return the path of the ``ffmpeg`` executable found on ``PATH``.

    Raises:
        RuntimeError: If ``shutil.which`` cannot locate ``ffmpeg``.
    """
    located = shutil.which("ffmpeg")
    if located is not None:
        return located
    # Fail loudly with setup instructions rather than letting subprocess
    # raise a less helpful FileNotFoundError later on.
    raise RuntimeError(
        "ffmpeg binary not found on PATH. It should be installed by "
        "the Dockerfile (line 13). Ensure you're running inside the "
        "docker image or install ffmpeg locally."
    )
def _write_raw_frames(frames: np.ndarray, path: str) -> tuple[int, int]:
    """Write RGB frames to ``path`` as raw ``rgb24`` bytes.

    Args:
        frames: Array of shape ``[T, H, W, 3]``. ``uint8`` input is written
            unchanged; any other dtype is saturated to ``[0, 255]`` and then
            cast to ``uint8``.
        path: Destination file; created or overwritten.

    Returns:
        ``(h, w)``: the frame height and width taken from ``frames.shape``.

    Raises:
        ValueError: If ``frames`` is not 4-D with a trailing axis of size 3.
    """
    if frames.ndim != 4 or frames.shape[-1] != 3:
        raise ValueError(
            f"frames must be [T, H, W, 3] uint8, got {frames.shape}"
        )
    if frames.dtype != np.uint8:
        # Saturate before casting: a bare astype() wraps out-of-range values
        # (300 -> 44, -5 -> 251), which shows up as corrupted pixels.
        frames = np.clip(frames, 0, 255).astype(np.uint8)
    # tofile() always writes in C order and avoids building the second
    # in-memory copy of the whole clip that tobytes() would create.
    frames.tofile(path)
    _, h, w, _ = frames.shape
    return h, w
def _write_wav(audio: np.ndarray, sample_rate: int, path: str) -> None:
    """Write audio samples to ``path`` as a mono 16-bit PCM WAV file.

    Uses the standard-library ``wave`` module, so this helper needs no
    third-party package other than ``numpy``.

    Args:
        audio: Float samples; the array is flattened to 1-D and treated as a
            single channel. Values are scaled by 32767 and saturated to the
            int16 range, so samples outside ``[-1.0, 1.0]`` are clipped.
        sample_rate: Sample rate in Hz written to the WAV header.
        path: Destination file; created or overwritten.
    """
    import wave

    samples = np.asarray(audio, dtype=np.float32).reshape(-1)
    # WAV PCM data is little-endian; "<i2" pins the byte order regardless of
    # the host platform.
    pcm = np.clip(samples * 32767.0, -32768, 32767).astype("<i2")
    with wave.open(path, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)  # bytes per sample -> 16-bit
        wav.setframerate(sample_rate)
        wav.writeframes(pcm.tobytes())
def frames_to_mp4_loop(frames: np.ndarray, fps: int) -> bytes:
    """Render ``frames`` as a silent MP4 and return the encoded bytes.

    Intended for the idle clip: the output carries no audio track (``-an``),
    so it can be looped on an HTMLMediaElement without audible seams.

    Args:
        frames: Frames to encode; must be non-empty. Shape and dtype handling
            is delegated to ``_write_raw_frames``.
        fps: Frame rate passed to ffmpeg for the raw video input.

    Returns:
        The contents of the MP4 file produced by ffmpeg (libx264, yuv420p).

    Raises:
        ValueError: If ``frames`` has no elements.
    """
    if frames.size == 0:
        raise ValueError("frames_to_mp4_loop: empty frames")

    ffmpeg = _ffmpeg_bin()
    with tempfile.TemporaryDirectory() as workdir:
        frames_file = os.path.join(workdir, "frames.raw")
        mp4_file = os.path.join(workdir, "out.mp4")
        height, width = _write_raw_frames(frames, frames_file)

        # Raw input carries no header, so geometry and rate must be spelled
        # out before the "-i".
        raw_input = [
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{width}x{height}",
            "-r", str(fps),
            "-i", frames_file,
        ]
        encode = [
            "-an",
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            "-movflags", "+faststart",
        ]
        argv = [ffmpeg, "-y", *raw_input, *encode, mp4_file]

        log.debug("muxer idle clip: %s", " ".join(argv))
        _run_ffmpeg(argv)

        # Read the result before the temporary directory is removed.
        with open(mp4_file, "rb") as fh:
            payload = fh.read()
    return payload
def frames_and_audio_to_mp4(
    frames: np.ndarray,
    audio: np.ndarray,
    sample_rate: int,
    fps: int,
) -> bytes:
    """Mux ``frames`` and ``audio`` into an MP4 (H.264 video + AAC audio).

    Intended for the per-turn speaking clips. ffmpeg is invoked with
    ``-shortest``, so encoding stops when the shorter stream ends.

    Args:
        frames: Frames to encode; must be non-empty. Shape and dtype handling
            is delegated to ``_write_raw_frames``.
        audio: Audio samples; must be non-empty. Conversion to WAV is
            delegated to ``_write_wav``.
        sample_rate: Audio sample rate in Hz.
        fps: Frame rate passed to ffmpeg for the raw video input.

    Returns:
        The contents of the MP4 file produced by ffmpeg.

    Raises:
        ValueError: If ``frames`` or ``audio`` has no elements (frames are
            checked first).
    """
    if frames.size == 0:
        raise ValueError("frames_and_audio_to_mp4: empty frames")
    if audio.size == 0:
        raise ValueError("frames_and_audio_to_mp4: empty audio")

    ffmpeg = _ffmpeg_bin()
    with tempfile.TemporaryDirectory() as workdir:
        frames_file = os.path.join(workdir, "frames.raw")
        audio_file = os.path.join(workdir, "audio.wav")
        mp4_file = os.path.join(workdir, "out.mp4")

        height, width = _write_raw_frames(frames, frames_file)
        _write_wav(audio, sample_rate, audio_file)

        # Raw input carries no header, so geometry and rate must be spelled
        # out before its "-i".
        video_input = [
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{width}x{height}",
            "-r", str(fps),
            "-i", frames_file,
        ]
        audio_input = ["-i", audio_file]
        encode = [
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "128k",
            "-shortest",
            "-movflags", "+faststart",
        ]
        argv = [ffmpeg, "-y", *video_input, *audio_input, *encode, mp4_file]

        log.debug("muxer speaking clip: %s", " ".join(argv))
        _run_ffmpeg(argv)

        # Read the result before the temporary directory is removed.
        with open(mp4_file, "rb") as fh:
            payload = fh.read()
    return payload
def _run_ffmpeg(cmd: list[str]) -> None:
    """Run an ffmpeg command line to completion.

    Args:
        cmd: Full argv, executable first.

    Raises:
        subprocess.CalledProcessError: If the process exits with a non-zero
            status. The captured stderr is logged at ERROR level before the
            exception propagates.
    """
    try:
        subprocess.run(
            cmd,
            check=True,  # non-zero exit raises; no manual returncode check needed
            # ffmpeg reads interactive commands from stdin by default; detach
            # it so the child can neither block on nor consume the parent's
            # standard input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        log.error("ffmpeg failed (exit %d): %s", e.returncode, e.stderr.decode(errors="replace"))
        raise