8 Commits

Author SHA1 Message Date
bhetherman ca5b7a8f28 Make musetalk a proper Python package
Add missing __init__.py files so `musetalk.*` subpackages import
without relying on namespace-package behavior or sys.path hacks, and
narrow the root `/models/` gitignore so `musetalk/models/__init__.py`
is tracked.

Also add a minimal pyproject.toml so the source can be installed with
`pip install .` (or `pip install -e .`) without pulling the pinned
requirements.txt, which consumers typically need to override.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 23:02:52 -04:00
李洋 0a89dec45a feat: update download_weights.bat (#372)
更换weights下载工具
更换face-parse-bisent源
2025-09-26 13:44:17 +08:00
Alexey 8c19579b1e fix: convert all audio to WAV 16kHz PCM before processing (#379) 2025-09-26 13:42:23 +08:00
yykani 9deb9bea0d fix: ensure upper bound does not go below zero in landmark extraction (#329) 2025-07-02 16:42:35 +08:00
Nick Davis 6e39bd0d00 fix: preprocess import bug (#345)
Fixes a hang when running "python -m scripts.preprocess --config ./configs/training/preprocess.yaml" on Ubuntu: because of the import order, "from musetalk.utils.face_detection import FaceAlignment, LandmarksType" hung when it was executed before "import torch". Moving "import torch" ahead of that import resolves the problem.
2025-07-02 16:40:49 +08:00
GaoLeiA 26ca7c2c03 fix: use torch.no_grad() in inference to prevent excessive memory usage (~30GB) with inference (#349) 2025-07-02 16:38:56 +08:00
Wei Lin Liu 8ca7d1884c fix: download_weights.sh (#318)
Fixed the wrong mkdir syntax and the wrong install location of face-parse-bisent, and improved readability.
2025-04-22 16:49:40 +08:00
Zhizhou Zhong 67e7ee3c73 feat: windows infer & gradio (#312)
* fix: windows infer

* docs: update readme

* docs: update readme

* feat: v1.5 gradio for windows&linux

* fix: dependencies

* feat: windows infer & gradio

---------

Co-authored-by: NeRF-Factory <zzhizhou66@gmail.com>
2025-04-12 23:22:22 +08:00
14 changed files with 83 additions and 36 deletions
+1 -1
View File
@@ -5,7 +5,7 @@
*.pyc
.ipynb_checkpoints
results/
models/
/models/
**/__pycache__/
*.py[cod]
*$py.class
+8 -12
View File
@@ -14,32 +14,28 @@ mkdir %CheckpointsDir%\sd-vae-ft-mse
mkdir %CheckpointsDir%\whisper
:: Install required packages
pip install -U "huggingface_hub[cli]"
pip install gdown
pip install -U "huggingface_hub[hf_xet]"
:: Set HuggingFace endpoint
set HF_ENDPOINT=https://hf-mirror.com
:: Download MuseTalk weights
huggingface-cli download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
hf download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
:: Download SD VAE weights
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
hf download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
:: Download Whisper weights
huggingface-cli download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
hf download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
:: Download DWPose weights
huggingface-cli download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
hf download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
:: Download SyncNet weights
huggingface-cli download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
hf download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
:: Download Face Parse Bisent weights (using gdown)
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O %CheckpointsDir%\face-parse-bisent\79999_iter.pth
:: Download ResNet weights
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o %CheckpointsDir%\face-parse-bisent\resnet18-5c106cde.pth
:: Download face-parse-bisent weights
hf download ManyOtherFunctions/face-parse-bisent --local-dir %CheckpointsDir%\face-parse-bisent --include "79999_iter.pth" "resnet18-5c106cde.pth"
echo All weights have been downloaded successfully!
endlocal
+27 -13
View File
@@ -4,34 +4,48 @@
CheckpointsDir="models"
# Create necessary directories
mkdir -p $CheckpointsDir/{musetalk,musetalkV15,syncnet,dwpose,face-parse-bisent,sd-vae-ft-mse,whisper}
mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper
# Install required packages
pip install -U "huggingface_hub[cli]"
pip install gdown
# Set HuggingFace endpoint
# Set HuggingFace mirror endpoint
export HF_ENDPOINT=https://hf-mirror.com
# Download MuseTalk weights
huggingface-cli download TMElyralab/MuseTalk --local-dir $CheckpointsDir
# Download MuseTalk V1.0 weights
huggingface-cli download TMElyralab/MuseTalk \
--local-dir $CheckpointsDir \
--include "musetalk/musetalk.json" "musetalk/pytorch_model.bin"
# Download MuseTalk V1.5 weights (unet.pth)
huggingface-cli download TMElyralab/MuseTalk \
--local-dir $CheckpointsDir \
--include "musetalkV15/musetalk.json" "musetalkV15/unet.pth"
# Download SD VAE weights
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir $CheckpointsDir/sd-vae --include "config.json" "diffusion_pytorch_model.bin"
huggingface-cli download stabilityai/sd-vae-ft-mse \
--local-dir $CheckpointsDir/sd-vae \
--include "config.json" "diffusion_pytorch_model.bin"
# Download Whisper weights
huggingface-cli download openai/whisper-tiny --local-dir $CheckpointsDir/whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
huggingface-cli download openai/whisper-tiny \
--local-dir $CheckpointsDir/whisper \
--include "config.json" "pytorch_model.bin" "preprocessor_config.json"
# Download DWPose weights
huggingface-cli download yzd-v/DWPose --local-dir $CheckpointsDir/dwpose --include "dw-ll_ucoco_384.pth"
huggingface-cli download yzd-v/DWPose \
--local-dir $CheckpointsDir/dwpose \
--include "dw-ll_ucoco_384.pth"
# Download SyncNet weights
huggingface-cli download ByteDance/LatentSync --local-dir $CheckpointsDir/syncnet --include "latentsync_syncnet.pt"
huggingface-cli download ByteDance/LatentSync \
--local-dir $CheckpointsDir/syncnet \
--include "latentsync_syncnet.pt"
# Download Face Parse Bisent weights (using gdown)
# Download Face Parse Bisent weights
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \
-o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
# Download ResNet weights
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
echo "All weights have been downloaded successfully!"
echo "✅ All weights have been downloaded successfully!"
View File
View File
+7 -4
View File
@@ -15,6 +15,7 @@ from decord.ndarray import cpu
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
from musetalk.data import audio
from musetalk.utils.audio_utils import ensure_wav
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
@@ -171,7 +172,8 @@ class FaceDataset(Dataset):
"""
if not os.path.exists(wav_path):
return None
audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
wav_path_converted = ensure_wav(wav_path)
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
assert sampling_rate == 16000
while start_index >= 25 * 30:
@@ -206,11 +208,12 @@ class FaceDataset(Dataset):
if not os.path.exists(wav_path):
return None
audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
wav_path_converted = ensure_wav(wav_path)
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
assert sampling_rate == 16000
audio_input = self.mel_feature_extractor(audio_input)
return audio_input, start_index
audio_mel = self.mel_feature_extractor(audio_input_librosa)
return audio_mel, start_index
def mel_feature_extractor(self, audio_input):
"""Extract mel spectrogram features
View File
View File
+17
View File
@@ -0,0 +1,17 @@
import os, subprocess
def ensure_wav(input_path: str, target_path: str | None = None) -> str:
"""
Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
Returns path to the converted .wav (original if already correct).
"""
if not isinstance(input_path, str) or not os.path.exists(input_path):
return input_path
base, ext = os.path.splitext(input_path)
ext = ext.lower()
if target_path is None:
target_path = base + "_16k.wav"
cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
return target_path
View File
+2 -1
View File
@@ -118,7 +118,8 @@ def get_landmark_and_bbox(img_list,upperbondrange =0):
if upperbondrange != 0:
half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整 + 向下(偏29) - 向上(偏28)
half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
upper_bond = half_face_coord[1]-half_face_dist
min_upper_bond = 0
upper_bond = max(min_upper_bond, half_face_coord[1] - half_face_dist)
f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
x1, y1, x2, y2 = f_landmark
+15
View File
@@ -0,0 +1,15 @@
[build-system]
requires = ["setuptools>=64"]
build-backend = "setuptools.build_meta"
[project]
name = "musetalk"
version = "1.5.0"
description = "MuseTalk: audio-driven lip-sync (source-only install; dependencies managed by the consumer)"
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
[tool.setuptools.packages.find]
include = ["musetalk*"]
exclude = ["scripts*", "assets*", "data*", "configs*"]
+3 -3
View File
@@ -1,6 +1,9 @@
import os
import argparse
import subprocess
import torch
import numpy as np
from tqdm import tqdm
from omegaconf import OmegaConf
from typing import Tuple, List, Union
import decord
@@ -9,9 +12,6 @@ import cv2
from musetalk.utils.face_detection import FaceAlignment,LandmarksType
from mmpose.apis import inference_topdown, init_model
from mmpose.structures import merge_data_samples
import torch
import numpy as np
from tqdm import tqdm
import sys
def fast_check_ffmpeg():
+1
View File
@@ -235,6 +235,7 @@ class Avatar:
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
self.idx = self.idx + 1
@torch.no_grad()
def inference(self, audio_path, out_vid_name, fps, skip_save_images):
os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
print("start inference")