8 Commits

Author SHA1 Message Date
bhetherman ca5b7a8f28 Make musetalk a proper Python package
Add missing __init__.py files so `musetalk.*` subpackages import
without relying on namespace-package behavior or sys.path hacks, and
narrow the root `/models/` gitignore so `musetalk/models/__init__.py`
is tracked.

Also add a minimal pyproject.toml so the source can be installed with
`pip install .` (or `pip install -e .`) without pulling the pinned
requirements.txt, which consumers typically need to override.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 23:02:52 -04:00
李洋 0a89dec45a feat: update download_weights.bat (#372)
更换weights下载工具
更换face-parse-bisent源
2025-09-26 13:44:17 +08:00
Alexey 8c19579b1e fix: convert all audio to WAV 16kHz PCM before processing (#379) 2025-09-26 13:42:23 +08:00
yykani 9deb9bea0d fix: ensure upper bond does not go below zero in landmark extraction (#329) 2025-07-02 16:42:35 +08:00
Nick Davis 6e39bd0d00 fix: preprocess import bug (#345)
Fixes a hang when running "python -m scripts.preprocess --config ./configs/training/preprocess.yaml" on Ubuntu: because of where "import torch" sat in the import order, executing "from musetalk.utils.face_detection import FaceAlignment, LandmarksType" would hang.
2025-07-02 16:40:49 +08:00
GaoLeiA 26ca7c2c03 fix: use torch.no_grad() in inference to prevent excessive memory usage (~30GB) with inference (#349) 2025-07-02 16:38:56 +08:00
Wei Lin Liu 8ca7d1884c fix: download_weights.sh (#318)
Fixed the wrong mkdir syntax and the wrong install location of face-parse-bisent, and improved readability
2025-04-22 16:49:40 +08:00
Zhizhou Zhong 67e7ee3c73 feat: windows infer & gradio (#312)
* fix: windows infer

* docs: update readme

* docs: update readme

* feat: v1.5 gradio for windows&linux

* fix: dependencies

* feat: windows infer & gradio

---------

Co-authored-by: NeRF-Factory <zzhizhou66@gmail.com>
2025-04-12 23:22:22 +08:00
14 changed files with 83 additions and 36 deletions
+1 -1
View File
@@ -5,7 +5,7 @@
*.pyc *.pyc
.ipynb_checkpoints .ipynb_checkpoints
results/ results/
models/ /models/
**/__pycache__/ **/__pycache__/
*.py[cod] *.py[cod]
*$py.class *$py.class
+9 -13
View File
@@ -14,32 +14,28 @@ mkdir %CheckpointsDir%\sd-vae-ft-mse
mkdir %CheckpointsDir%\whisper mkdir %CheckpointsDir%\whisper
:: Install required packages :: Install required packages
pip install -U "huggingface_hub[cli]" pip install -U "huggingface_hub[hf_xet]"
pip install gdown
:: Set HuggingFace endpoint :: Set HuggingFace endpoint
set HF_ENDPOINT=https://hf-mirror.com set HF_ENDPOINT=https://hf-mirror.com
:: Download MuseTalk weights :: Download MuseTalk weights
huggingface-cli download TMElyralab/MuseTalk --local-dir %CheckpointsDir% hf download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
:: Download SD VAE weights :: Download SD VAE weights
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin" hf download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
:: Download Whisper weights :: Download Whisper weights
huggingface-cli download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json" hf download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
:: Download DWPose weights :: Download DWPose weights
huggingface-cli download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth" hf download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
:: Download SyncNet weights :: Download SyncNet weights
huggingface-cli download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt" hf download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
:: Download Face Parse Bisent weights (using gdown) :: Download face-parse-bisent weights
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O %CheckpointsDir%\face-parse-bisent\79999_iter.pth hf download ManyOtherFunctions/face-parse-bisent --local-dir %CheckpointsDir%\face-parse-bisent --include "79999_iter.pth" "resnet18-5c106cde.pth"
:: Download ResNet weights
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o %CheckpointsDir%\face-parse-bisent\resnet18-5c106cde.pth
echo All weights have been downloaded successfully! echo All weights have been downloaded successfully!
endlocal endlocal
+27 -13
View File
@@ -4,34 +4,48 @@
CheckpointsDir="models" CheckpointsDir="models"
# Create necessary directories # Create necessary directories
mkdir -p $CheckpointsDir/{musetalk,musetalkV15,syncnet,dwpose,face-parse-bisent,sd-vae-ft-mse,whisper} mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper
# Install required packages # Install required packages
pip install -U "huggingface_hub[cli]" pip install -U "huggingface_hub[cli]"
pip install gdown pip install gdown
# Set HuggingFace endpoint # Set HuggingFace mirror endpoint
export HF_ENDPOINT=https://hf-mirror.com export HF_ENDPOINT=https://hf-mirror.com
# Download MuseTalk weights # Download MuseTalk V1.0 weights
huggingface-cli download TMElyralab/MuseTalk --local-dir $CheckpointsDir huggingface-cli download TMElyralab/MuseTalk \
--local-dir $CheckpointsDir \
--include "musetalk/musetalk.json" "musetalk/pytorch_model.bin"
# Download MuseTalk V1.5 weights (unet.pth)
huggingface-cli download TMElyralab/MuseTalk \
--local-dir $CheckpointsDir \
--include "musetalkV15/musetalk.json" "musetalkV15/unet.pth"
# Download SD VAE weights # Download SD VAE weights
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir $CheckpointsDir/sd-vae --include "config.json" "diffusion_pytorch_model.bin" huggingface-cli download stabilityai/sd-vae-ft-mse \
--local-dir $CheckpointsDir/sd-vae \
--include "config.json" "diffusion_pytorch_model.bin"
# Download Whisper weights # Download Whisper weights
huggingface-cli download openai/whisper-tiny --local-dir $CheckpointsDir/whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json" huggingface-cli download openai/whisper-tiny \
--local-dir $CheckpointsDir/whisper \
--include "config.json" "pytorch_model.bin" "preprocessor_config.json"
# Download DWPose weights # Download DWPose weights
huggingface-cli download yzd-v/DWPose --local-dir $CheckpointsDir/dwpose --include "dw-ll_ucoco_384.pth" huggingface-cli download yzd-v/DWPose \
--local-dir $CheckpointsDir/dwpose \
--include "dw-ll_ucoco_384.pth"
# Download SyncNet weights # Download SyncNet weights
huggingface-cli download ByteDance/LatentSync --local-dir $CheckpointsDir/syncnet --include "latentsync_syncnet.pt" huggingface-cli download ByteDance/LatentSync \
--local-dir $CheckpointsDir/syncnet \
--include "latentsync_syncnet.pt"
# Download Face Parse Bisent weights (using gdown) # Download Face Parse Bisent weights
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \
-o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
# Download ResNet weights echo "✅ All weights have been downloaded successfully!"
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
echo "All weights have been downloaded successfully!"
View File
View File
+7 -4
View File
@@ -15,6 +15,7 @@ from decord.ndarray import cpu
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
from musetalk.data import audio from musetalk.data import audio
from musetalk.utils.audio_utils import ensure_wav
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
@@ -171,7 +172,8 @@ class FaceDataset(Dataset):
""" """
if not os.path.exists(wav_path): if not os.path.exists(wav_path):
return None return None
audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000) wav_path_converted = ensure_wav(wav_path)
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
assert sampling_rate == 16000 assert sampling_rate == 16000
while start_index >= 25 * 30: while start_index >= 25 * 30:
@@ -206,11 +208,12 @@ class FaceDataset(Dataset):
if not os.path.exists(wav_path): if not os.path.exists(wav_path):
return None return None
audio_input, sampling_rate = librosa.load(wav_path, sr=16000) wav_path_converted = ensure_wav(wav_path)
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
assert sampling_rate == 16000 assert sampling_rate == 16000
audio_input = self.mel_feature_extractor(audio_input) audio_mel = self.mel_feature_extractor(audio_input_librosa)
return audio_input, start_index return audio_mel, start_index
def mel_feature_extractor(self, audio_input): def mel_feature_extractor(self, audio_input):
"""Extract mel spectrogram features """Extract mel spectrogram features
View File
View File
+17
View File
@@ -0,0 +1,17 @@
import os, subprocess
def ensure_wav(input_path: str, target_path: str | None = None) -> str:
"""
Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
Returns path to the converted .wav (original if already correct).
"""
if not isinstance(input_path, str) or not os.path.exists(input_path):
return input_path
base, ext = os.path.splitext(input_path)
ext = ext.lower()
if target_path is None:
target_path = base + "_16k.wav"
cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
return target_path
View File
+2 -1
View File
@@ -118,7 +118,8 @@ def get_landmark_and_bbox(img_list,upperbondrange =0):
if upperbondrange != 0: if upperbondrange != 0:
half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整 + 向下(偏29) - 向上(偏28) half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整 + 向下(偏29) - 向上(偏28)
half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1] half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
upper_bond = half_face_coord[1]-half_face_dist min_upper_bond = 0
upper_bond = max(min_upper_bond, half_face_coord[1] - half_face_dist)
f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1])) f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
x1, y1, x2, y2 = f_landmark x1, y1, x2, y2 = f_landmark
+15
View File
@@ -0,0 +1,15 @@
# Build backend: the package is built with setuptools (version 64 or newer).
[build-system]
requires = ["setuptools>=64"]
build-backend = "setuptools.build_meta"
# Project metadata. No `dependencies` key is declared here; as the
# description states, this is a source-only install and the consumer is
# expected to manage dependencies.
[project]
name = "musetalk"
version = "1.5.0"
description = "MuseTalk: audio-driven lip-sync (source-only install; dependencies managed by the consumer)"
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
# Package discovery: only packages whose names match `musetalk*` are picked
# up; the listed non-library top-level directories are excluded explicitly.
[tool.setuptools.packages.find]
include = ["musetalk*"]
exclude = ["scripts*", "assets*", "data*", "configs*"]
+4 -4
View File
@@ -1,6 +1,9 @@
import os import os
import argparse import argparse
import subprocess import subprocess
import torch
import numpy as np
from tqdm import tqdm
from omegaconf import OmegaConf from omegaconf import OmegaConf
from typing import Tuple, List, Union from typing import Tuple, List, Union
import decord import decord
@@ -9,9 +12,6 @@ import cv2
from musetalk.utils.face_detection import FaceAlignment,LandmarksType from musetalk.utils.face_detection import FaceAlignment,LandmarksType
from mmpose.apis import inference_topdown, init_model from mmpose.apis import inference_topdown, init_model
from mmpose.structures import merge_data_samples from mmpose.structures import merge_data_samples
import torch
import numpy as np
from tqdm import tqdm
import sys import sys
def fast_check_ffmpeg(): def fast_check_ffmpeg():
@@ -331,4 +331,4 @@ if __name__ == "__main__":
config = OmegaConf.load(args.config) config = OmegaConf.load(args.config)
main(config) main(config)
+1
View File
@@ -235,6 +235,7 @@ class Avatar:
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame) cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
self.idx = self.idx + 1 self.idx = self.idx + 1
@torch.no_grad()
def inference(self, audio_path, out_vid_name, fps, skip_save_images): def inference(self, audio_path, out_vid_name, fps, skip_save_images):
os.makedirs(self.avatar_path + '/tmp', exist_ok=True) os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
print("start inference") print("start inference")