147 lines
4.4 KiB
Python
147 lines
4.4 KiB
Python
"""ffmpeg-based frame + audio → MP4 muxing.

Uses the system ``ffmpeg`` binary already installed in the Dockerfile.
No extra Python dependencies beyond ``numpy``.
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
|
|
import numpy as np
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def _ffmpeg_bin() -> str:
    """Locate the ``ffmpeg`` executable on ``PATH`` and return its path.

    Raises:
        RuntimeError: if ``shutil.which`` cannot find an ``ffmpeg`` binary.
    """
    located = shutil.which("ffmpeg")
    if located is not None:
        return located
    # Nothing on PATH: fail loudly with a hint about where ffmpeg is
    # expected to come from.
    raise RuntimeError(
        "ffmpeg binary not found on PATH. It should be installed by "
        "the Dockerfile (line 13). Ensure you're running inside the "
        "docker image or install ffmpeg locally."
    )
|
|
|
|
|
|
def _write_raw_frames(frames: np.ndarray, path: str) -> tuple[int, int]:
    """Write RGB frames to ``path`` as raw ``rgb24`` bytes.

    Args:
        frames: Array of shape ``[T, H, W, 3]``. ``uint8`` input is written
            unchanged. Floating-point input is clipped to ``[0, 255]`` before
            the cast so out-of-range values saturate instead of wrapping.
            Any other dtype is converted with ``astype(np.uint8)``.
        path: Destination file; it is created or overwritten.

    Returns:
        ``(h, w)`` — frame height and width taken from ``frames.shape``.

    Raises:
        ValueError: if ``frames`` is not 4-D with a trailing dimension of 3.
    """
    if frames.ndim != 4 or frames.shape[-1] != 3:
        raise ValueError(
            f"frames must be [T, H, W, 3] uint8, got {frames.shape}"
        )
    if frames.dtype != np.uint8:
        if np.issubdtype(frames.dtype, np.floating):
            # Casting an out-of-range float to uint8 gives wrapped or
            # platform-dependent values; saturate first.
            frames = np.clip(frames, 0.0, 255.0)
        frames = frames.astype(np.uint8)
    _, h, w, _ = frames.shape
    # tofile() writes the array buffer straight to disk in C order, so an
    # already-contiguous clip is not duplicated in memory the way
    # ``tobytes()`` would duplicate it.
    np.ascontiguousarray(frames).tofile(path)
    return h, w
|
|
|
|
|
|
def _write_wav(audio: np.ndarray, sample_rate: int, path: str) -> None:
    """Write a float32 mono audio array to a 16-bit PCM WAV at ``path``.

    Args:
        audio: Samples, nominally in ``[-1.0, 1.0]``. The array is flattened
            to one dimension and treated as a single channel.
        sample_rate: Sampling rate in Hz, stored in the WAV header.
        path: Destination file; it is created or overwritten.

    Samples are scaled by 32767 and clipped to the int16 range, so values
    outside ``[-1.0, 1.0]`` saturate rather than wrap.
    """
    # The stdlib writer is enough for plain PCM and keeps this module free
    # of third-party dependencies other than numpy.
    import wave

    samples = np.asarray(audio, dtype=np.float32).reshape(-1)
    # "<i2": WAV PCM data is little-endian 16-bit regardless of host order.
    pcm = np.clip(samples * 32767.0, -32768, 32767).astype("<i2")
    with wave.open(path, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)  # bytes per sample -> 16-bit
        wav.setframerate(sample_rate)
        wav.writeframes(pcm.tobytes())
|
|
|
|
|
|
def frames_to_mp4_loop(frames: np.ndarray, fps: int) -> bytes:
    """Encode ``frames`` to a silent MP4 suitable for looping playback.

    Used for the idle clip: no audio track, loopable on an HTMLMediaElement
    without audible seams.

    Args:
        frames: ``[T, H, W, 3]`` RGB frames, written to disk by
            ``_write_raw_frames``.
        fps: Frame rate passed to ffmpeg for the raw input.

    Returns:
        The encoded MP4 file contents.

    Raises:
        ValueError: if ``frames`` is empty or has the wrong shape.
        RuntimeError: if no ``ffmpeg`` binary is found on PATH.
        subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    if frames.size == 0:
        raise ValueError("frames_to_mp4_loop: empty frames")

    ffmpeg = _ffmpeg_bin()
    with tempfile.TemporaryDirectory() as td:
        raw_path = os.path.join(td, "frames.raw")
        out_path = os.path.join(td, "out.mp4")
        h, w = _write_raw_frames(frames, raw_path)

        cmd = [
            ffmpeg, "-y",
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(fps),
            "-i", raw_path,
            "-an",
        ]
        if w % 2 or h % 2:
            # libx264 with yuv420p rejects odd dimensions ("not divisible
            # by 2"). Pad one pixel on the right/bottom so such frames
            # encode; even-sized input keeps the exact same command line.
            cmd += ["-vf", f"pad={w + w % 2}:{h + h % 2}"]
        cmd += [
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            # Move the moov atom to the front so playback can start before
            # the whole file has been received.
            "-movflags", "+faststart",
            out_path,
        ]
        log.debug("muxer idle clip: %s", " ".join(cmd))
        _run_ffmpeg(cmd)
        # Read the result before the temporary directory is removed.
        with open(out_path, "rb") as f:
            return f.read()
|
|
|
|
|
|
def frames_and_audio_to_mp4(
    frames: np.ndarray,
    audio: np.ndarray,
    sample_rate: int,
    fps: int,
) -> bytes:
    """Encode ``frames`` + ``audio`` to an MP4 with H.264 video + AAC audio.

    Used for per-turn speaking clips.

    Args:
        frames: ``[T, H, W, 3]`` RGB frames, written to disk by
            ``_write_raw_frames``.
        audio: Audio samples, written by ``_write_wav`` as a mono 16-bit
            PCM WAV.
        sample_rate: Sampling rate of ``audio`` in Hz.
        fps: Frame rate passed to ffmpeg for the raw video input.

    Returns:
        The encoded MP4 file contents.

    Raises:
        ValueError: if ``frames`` or ``audio`` is empty, or ``frames`` has
            the wrong shape.
        RuntimeError: if no ``ffmpeg`` binary is found on PATH.
        subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    if frames.size == 0:
        raise ValueError("frames_and_audio_to_mp4: empty frames")
    if audio.size == 0:
        raise ValueError("frames_and_audio_to_mp4: empty audio")

    ffmpeg = _ffmpeg_bin()
    with tempfile.TemporaryDirectory() as td:
        raw_path = os.path.join(td, "frames.raw")
        wav_path = os.path.join(td, "audio.wav")
        out_path = os.path.join(td, "out.mp4")

        h, w = _write_raw_frames(frames, raw_path)
        _write_wav(audio, sample_rate, wav_path)

        cmd = [
            ffmpeg, "-y",
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(fps),
            "-i", raw_path,
            "-i", wav_path,
        ]
        if w % 2 or h % 2:
            # libx264 with yuv420p rejects odd dimensions ("not divisible
            # by 2"). Pad one pixel on the right/bottom so such frames
            # encode; even-sized input keeps the exact same command line.
            cmd += ["-vf", f"pad={w + w % 2}:{h + h % 2}"]
        cmd += [
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "128k",
            # Stop at the end of the shorter stream so video and audio
            # finish together.
            "-shortest",
            # Move the moov atom to the front so playback can start before
            # the whole file has been received.
            "-movflags", "+faststart",
            out_path,
        ]
        log.debug("muxer speaking clip: %s", " ".join(cmd))
        _run_ffmpeg(cmd)
        # Read the result before the temporary directory is removed.
        with open(out_path, "rb") as f:
            return f.read()
|
|
|
|
|
|
def _run_ffmpeg(cmd: list[str]) -> None:
    """Run an ffmpeg command to completion, raising if it fails.

    Args:
        cmd: Full argument vector, executable first.

    Raises:
        subprocess.CalledProcessError: if the process exits non-zero. Its
            captured stderr is logged at ERROR level before the exception
            is re-raised unchanged.
    """
    try:
        subprocess.run(
            cmd,
            # check=True already raises on any non-zero exit status, so no
            # separate return-code test is needed afterwards.
            check=True,
            # ffmpeg reads stdin for interactive commands; give it an empty
            # stdin so it cannot consume or block on the parent's input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        log.error("ffmpeg failed (exit %d): %s", e.returncode, e.stderr.decode(errors="replace"))
        raise
|