first stab at adding video
This commit is contained in:
@@ -0,0 +1,146 @@
|
||||
"""ffmpeg-based frame + audio → MP4 muxing.
|
||||
|
||||
Uses the system ``ffmpeg`` binary already installed in the Dockerfile.
|
||||
No extra Python dependencies beyond ``numpy``, plus ``scipy`` (imported lazily by ``_write_wav``) for WAV output.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ffmpeg_bin() -> str:
    """Locate the ``ffmpeg`` executable on PATH and return its path.

    Raises:
        RuntimeError: If ``shutil.which`` cannot find an ``ffmpeg`` binary.
    """
    located = shutil.which("ffmpeg")
    if located is not None:
        return located

    message = (
        "ffmpeg binary not found on PATH. It should be installed by "
        "the Dockerfile (line 13). Ensure you're running inside the "
        "docker image or install ffmpeg locally."
    )
    raise RuntimeError(message)
|
||||
|
||||
|
||||
def _write_raw_frames(frames: np.ndarray, path: str) -> tuple[int, int]:
    """Write RGB frames to ``path`` as raw rgb24 bytes.

    Args:
        frames: Array of shape ``[T, H, W, 3]``. Non-``uint8`` input is
            clipped to ``[0, 255]`` and then cast to ``uint8``.
        path: Destination file; it is created or overwritten.

    Returns:
        ``(h, w)``: the frame height and width taken from ``frames.shape``.

    Raises:
        ValueError: If ``frames`` is not 4-D with a trailing dimension of 3.
    """
    if frames.ndim != 4 or frames.shape[-1] != 3:
        raise ValueError(
            f"frames must be [T, H, W, 3] uint8, got {frames.shape}"
        )
    if frames.dtype != np.uint8:
        # A bare astype(uint8) wraps out-of-range integers modulo 256 (and is
        # undefined for out-of-range floats), which corrupts pixels. Clip
        # first, as _write_wav does for audio. The bounds are typed int16
        # scalars so that narrow inputs such as int8/bool promote to a wider
        # dtype instead of failing to represent 255.
        frames = np.clip(frames, np.int16(0), np.int16(255)).astype(np.uint8)

    # tofile streams the array in C order, the same byte layout tobytes()
    # produces, without first materialising a second full copy in memory.
    frames.tofile(path)

    _, h, w, _ = frames.shape
    return h, w
|
||||
|
||||
|
||||
def _write_wav(audio: np.ndarray, sample_rate: int, path: str) -> None:
    """Save ``audio`` to ``path`` as a 16-bit PCM WAV file.

    The input is converted to float32 and flattened to one dimension, then
    scaled by 32767, clipped to the int16 range and cast to int16 before
    being handed to ``scipy.io.wavfile.write``.
    """
    from scipy.io import wavfile  # type: ignore[import-not-found]

    samples = np.asarray(audio, dtype=np.float32).ravel()
    scaled = samples * 32767.0
    pcm = np.clip(scaled, -32768, 32767).astype(np.int16)
    wavfile.write(path, sample_rate, pcm)
|
||||
|
||||
|
||||
def frames_to_mp4_loop(frames: np.ndarray, fps: int) -> bytes:
    """Encode ``frames`` to a silent MP4 suitable for looping playback.

    Used for the idle clip: no audio track, loopable on an HTMLMediaElement
    without audible seams.

    Args:
        frames: RGB frames; shape and dtype are validated and normalised by
            ``_write_raw_frames``.
        fps: Frame rate passed to ffmpeg's ``-r`` option for the raw input.

    Returns:
        The bytes of the encoded MP4 file.

    Raises:
        ValueError: If ``frames`` is empty (or rejected by
            ``_write_raw_frames``).
        RuntimeError: If the ffmpeg binary cannot be located.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if frames.size == 0:
        raise ValueError("frames_to_mp4_loop: empty frames")

    ffmpeg = _ffmpeg_bin()
    with tempfile.TemporaryDirectory() as td:
        raw_path = os.path.join(td, "frames.raw")
        out_path = os.path.join(td, "out.mp4")
        h, w = _write_raw_frames(frames, raw_path)

        # libx264 with yuv420p (2x2 chroma subsampling) refuses odd widths or
        # heights. Pad up to the next even size only when needed, so the
        # command for even-sized input is unchanged.
        pad_args = (
            ["-vf", "pad=ceil(iw/2)*2:ceil(ih/2)*2"] if (w % 2 or h % 2) else []
        )

        cmd = [
            ffmpeg, "-y",
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(fps),
            "-i", raw_path,
            "-an",
            *pad_args,
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            "-movflags", "+faststart",
            out_path,
        ]
        log.debug("muxer idle clip: %s", " ".join(cmd))
        _run_ffmpeg(cmd)

        # Read the result before the temporary directory is removed.
        with open(out_path, "rb") as f:
            return f.read()
|
||||
|
||||
|
||||
def frames_and_audio_to_mp4(
    frames: np.ndarray,
    audio: np.ndarray,
    sample_rate: int,
    fps: int,
) -> bytes:
    """Encode ``frames`` + ``audio`` to an MP4 with H.264 video + AAC audio.

    Used for per-turn speaking clips.

    Args:
        frames: RGB frames; shape and dtype are validated and normalised by
            ``_write_raw_frames``.
        audio: Audio samples, written to a temporary WAV by ``_write_wav``.
        sample_rate: Sample rate of ``audio``, stored in the WAV header.
        fps: Frame rate passed to ffmpeg's ``-r`` option for the raw input.

    Returns:
        The bytes of the encoded MP4 file. ffmpeg is invoked with
        ``-shortest``, so the output ends with the shorter of the two streams.

    Raises:
        ValueError: If ``frames`` or ``audio`` is empty (or ``frames`` is
            rejected by ``_write_raw_frames``).
        RuntimeError: If the ffmpeg binary cannot be located.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if frames.size == 0:
        raise ValueError("frames_and_audio_to_mp4: empty frames")
    if audio.size == 0:
        raise ValueError("frames_and_audio_to_mp4: empty audio")

    ffmpeg = _ffmpeg_bin()
    with tempfile.TemporaryDirectory() as td:
        raw_path = os.path.join(td, "frames.raw")
        wav_path = os.path.join(td, "audio.wav")
        out_path = os.path.join(td, "out.mp4")

        h, w = _write_raw_frames(frames, raw_path)
        _write_wav(audio, sample_rate, wav_path)

        # libx264 with yuv420p (2x2 chroma subsampling) refuses odd widths or
        # heights. Pad up to the next even size only when needed, so the
        # command for even-sized input is unchanged.
        pad_args = (
            ["-vf", "pad=ceil(iw/2)*2:ceil(ih/2)*2"] if (w % 2 or h % 2) else []
        )

        cmd = [
            ffmpeg, "-y",
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(fps),
            "-i", raw_path,
            "-i", wav_path,
            *pad_args,
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "128k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        log.debug("muxer speaking clip: %s", " ".join(cmd))
        _run_ffmpeg(cmd)

        # Read the result before the temporary directory is removed.
        with open(out_path, "rb") as f:
            return f.read()
|
||||
|
||||
|
||||
def _run_ffmpeg(cmd: list[str]) -> None:
    """Run an ffmpeg command, logging its stderr if the process fails.

    Args:
        cmd: Full argument vector with the executable first; it is run
            directly, without a shell.

    Raises:
        subprocess.CalledProcessError: If the process exits with a non-zero
            status. The captured stderr is logged at ERROR level before the
            exception is re-raised.
    """
    try:
        # check=True raises on any non-zero exit status, so no separate
        # return-code check is needed after the call.
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        log.error("ffmpeg failed (exit %d): %s", e.returncode, e.stderr.decode(errors="replace"))
        raise
|
||||
Reference in New Issue
Block a user