Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ca5b7a8f28 | |||
| 0a89dec45a | |||
| 8c19579b1e | |||
| 9deb9bea0d | |||
| 6e39bd0d00 | |||
| 26ca7c2c03 | |||
| 8ca7d1884c | |||
| 67e7ee3c73 |
+1
-1
@@ -5,7 +5,7 @@
|
|||||||
*.pyc
|
*.pyc
|
||||||
.ipynb_checkpoints
|
.ipynb_checkpoints
|
||||||
results/
|
results/
|
||||||
models/
|
/models/
|
||||||
**/__pycache__/
|
**/__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
*$py.class
|
*$py.class
|
||||||
|
|||||||
+9
-13
@@ -14,32 +14,28 @@ mkdir %CheckpointsDir%\sd-vae-ft-mse
|
|||||||
mkdir %CheckpointsDir%\whisper
|
mkdir %CheckpointsDir%\whisper
|
||||||
|
|
||||||
:: Install required packages
|
:: Install required packages
|
||||||
pip install -U "huggingface_hub[cli]"
|
pip install -U "huggingface_hub[hf_xet]"
|
||||||
pip install gdown
|
|
||||||
|
|
||||||
:: Set HuggingFace endpoint
|
:: Set HuggingFace endpoint
|
||||||
set HF_ENDPOINT=https://hf-mirror.com
|
set HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
:: Download MuseTalk weights
|
:: Download MuseTalk weights
|
||||||
huggingface-cli download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
|
hf download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
|
||||||
|
|
||||||
:: Download SD VAE weights
|
:: Download SD VAE weights
|
||||||
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
|
hf download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
|
||||||
|
|
||||||
:: Download Whisper weights
|
:: Download Whisper weights
|
||||||
huggingface-cli download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
|
hf download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
|
||||||
|
|
||||||
:: Download DWPose weights
|
:: Download DWPose weights
|
||||||
huggingface-cli download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
|
hf download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
|
||||||
|
|
||||||
:: Download SyncNet weights
|
:: Download SyncNet weights
|
||||||
huggingface-cli download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
|
hf download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
|
||||||
|
|
||||||
:: Download Face Parse Bisent weights (using gdown)
|
:: Download face-parse-bisent weights
|
||||||
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O %CheckpointsDir%\face-parse-bisent\79999_iter.pth
|
hf download ManyOtherFunctions/face-parse-bisent --local-dir %CheckpointsDir%\face-parse-bisent --include "79999_iter.pth" "resnet18-5c106cde.pth"
|
||||||
|
|
||||||
:: Download ResNet weights
|
|
||||||
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o %CheckpointsDir%\face-parse-bisent\resnet18-5c106cde.pth
|
|
||||||
|
|
||||||
echo All weights have been downloaded successfully!
|
echo All weights have been downloaded successfully!
|
||||||
endlocal
|
endlocal
|
||||||
|
|||||||
+27
-13
@@ -4,34 +4,48 @@
|
|||||||
CheckpointsDir="models"
|
CheckpointsDir="models"
|
||||||
|
|
||||||
# Create necessary directories
|
# Create necessary directories
|
||||||
mkdir -p $CheckpointsDir/{musetalk,musetalkV15,syncnet,dwpose,face-parse-bisent,sd-vae-ft-mse,whisper}
|
mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper
|
||||||
|
|
||||||
# Install required packages
|
# Install required packages
|
||||||
pip install -U "huggingface_hub[cli]"
|
pip install -U "huggingface_hub[cli]"
|
||||||
pip install gdown
|
pip install gdown
|
||||||
|
|
||||||
# Set HuggingFace endpoint
|
# Set HuggingFace mirror endpoint
|
||||||
export HF_ENDPOINT=https://hf-mirror.com
|
export HF_ENDPOINT=https://hf-mirror.com
|
||||||
|
|
||||||
# Download MuseTalk weights
|
# Download MuseTalk V1.0 weights
|
||||||
huggingface-cli download TMElyralab/MuseTalk --local-dir $CheckpointsDir
|
huggingface-cli download TMElyralab/MuseTalk \
|
||||||
|
--local-dir $CheckpointsDir \
|
||||||
|
--include "musetalk/musetalk.json" "musetalk/pytorch_model.bin"
|
||||||
|
|
||||||
|
# Download MuseTalk V1.5 weights (unet.pth)
|
||||||
|
huggingface-cli download TMElyralab/MuseTalk \
|
||||||
|
--local-dir $CheckpointsDir \
|
||||||
|
--include "musetalkV15/musetalk.json" "musetalkV15/unet.pth"
|
||||||
|
|
||||||
# Download SD VAE weights
|
# Download SD VAE weights
|
||||||
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir $CheckpointsDir/sd-vae --include "config.json" "diffusion_pytorch_model.bin"
|
huggingface-cli download stabilityai/sd-vae-ft-mse \
|
||||||
|
--local-dir $CheckpointsDir/sd-vae \
|
||||||
|
--include "config.json" "diffusion_pytorch_model.bin"
|
||||||
|
|
||||||
# Download Whisper weights
|
# Download Whisper weights
|
||||||
huggingface-cli download openai/whisper-tiny --local-dir $CheckpointsDir/whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
|
huggingface-cli download openai/whisper-tiny \
|
||||||
|
--local-dir $CheckpointsDir/whisper \
|
||||||
|
--include "config.json" "pytorch_model.bin" "preprocessor_config.json"
|
||||||
|
|
||||||
# Download DWPose weights
|
# Download DWPose weights
|
||||||
huggingface-cli download yzd-v/DWPose --local-dir $CheckpointsDir/dwpose --include "dw-ll_ucoco_384.pth"
|
huggingface-cli download yzd-v/DWPose \
|
||||||
|
--local-dir $CheckpointsDir/dwpose \
|
||||||
|
--include "dw-ll_ucoco_384.pth"
|
||||||
|
|
||||||
# Download SyncNet weights
|
# Download SyncNet weights
|
||||||
huggingface-cli download ByteDance/LatentSync --local-dir $CheckpointsDir/syncnet --include "latentsync_syncnet.pt"
|
huggingface-cli download ByteDance/LatentSync \
|
||||||
|
--local-dir $CheckpointsDir/syncnet \
|
||||||
|
--include "latentsync_syncnet.pt"
|
||||||
|
|
||||||
# Download Face Parse Bisent weights (using gdown)
|
# Download Face Parse Bisent weights
|
||||||
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
|
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
|
||||||
|
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \
|
||||||
|
-o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
|
||||||
|
|
||||||
# Download ResNet weights
|
echo "✅ All weights have been downloaded successfully!"
|
||||||
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
|
|
||||||
|
|
||||||
echo "All weights have been downloaded successfully!"
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from decord.ndarray import cpu
|
|||||||
|
|
||||||
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
|
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
|
||||||
from musetalk.data import audio
|
from musetalk.data import audio
|
||||||
|
from musetalk.utils.audio_utils import ensure_wav
|
||||||
|
|
||||||
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
|
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
|
||||||
|
|
||||||
@@ -171,7 +172,8 @@ class FaceDataset(Dataset):
|
|||||||
"""
|
"""
|
||||||
if not os.path.exists(wav_path):
|
if not os.path.exists(wav_path):
|
||||||
return None
|
return None
|
||||||
audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
|
wav_path_converted = ensure_wav(wav_path)
|
||||||
|
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
|
||||||
assert sampling_rate == 16000
|
assert sampling_rate == 16000
|
||||||
|
|
||||||
while start_index >= 25 * 30:
|
while start_index >= 25 * 30:
|
||||||
@@ -206,11 +208,12 @@ class FaceDataset(Dataset):
|
|||||||
if not os.path.exists(wav_path):
|
if not os.path.exists(wav_path):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
|
wav_path_converted = ensure_wav(wav_path)
|
||||||
|
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
|
||||||
assert sampling_rate == 16000
|
assert sampling_rate == 16000
|
||||||
|
|
||||||
audio_input = self.mel_feature_extractor(audio_input)
|
audio_mel = self.mel_feature_extractor(audio_input_librosa)
|
||||||
return audio_input, start_index
|
return audio_mel, start_index
|
||||||
|
|
||||||
def mel_feature_extractor(self, audio_input):
|
def mel_feature_extractor(self, audio_input):
|
||||||
"""Extract mel spectrogram features
|
"""Extract mel spectrogram features
|
||||||
|
|||||||
@@ -0,0 +1,17 @@
|
|||||||
|
import os, subprocess
|
||||||
|
|
||||||
|
def ensure_wav(input_path: str, target_path: str | None = None) -> str:
|
||||||
|
"""
|
||||||
|
Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
|
||||||
|
Returns path to the converted .wav (original if already correct).
|
||||||
|
"""
|
||||||
|
if not isinstance(input_path, str) or not os.path.exists(input_path):
|
||||||
|
return input_path
|
||||||
|
base, ext = os.path.splitext(input_path)
|
||||||
|
ext = ext.lower()
|
||||||
|
|
||||||
|
if target_path is None:
|
||||||
|
target_path = base + "_16k.wav"
|
||||||
|
cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
|
||||||
|
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||||
|
return target_path
|
||||||
@@ -118,7 +118,8 @@ def get_landmark_and_bbox(img_list,upperbondrange =0):
|
|||||||
if upperbondrange != 0:
|
if upperbondrange != 0:
|
||||||
half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整 + 向下(偏29) - 向上(偏28)
|
half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整 + 向下(偏29) - 向上(偏28)
|
||||||
half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
|
half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
|
||||||
upper_bond = half_face_coord[1]-half_face_dist
|
min_upper_bond = 0
|
||||||
|
upper_bond = max(min_upper_bond, half_face_coord[1] - half_face_dist)
|
||||||
|
|
||||||
f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
|
f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
|
||||||
x1, y1, x2, y2 = f_landmark
|
x1, y1, x2, y2 = f_landmark
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=64"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "musetalk"
|
||||||
|
version = "1.5.0"
|
||||||
|
description = "MuseTalk: audio-driven lip-sync (source-only install; dependencies managed by the consumer)"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.10"
|
||||||
|
license = { text = "MIT" }
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
include = ["musetalk*"]
|
||||||
|
exclude = ["scripts*", "assets*", "data*", "configs*"]
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
from omegaconf import OmegaConf
|
from omegaconf import OmegaConf
|
||||||
from typing import Tuple, List, Union
|
from typing import Tuple, List, Union
|
||||||
import decord
|
import decord
|
||||||
@@ -9,9 +12,6 @@ import cv2
|
|||||||
from musetalk.utils.face_detection import FaceAlignment,LandmarksType
|
from musetalk.utils.face_detection import FaceAlignment,LandmarksType
|
||||||
from mmpose.apis import inference_topdown, init_model
|
from mmpose.apis import inference_topdown, init_model
|
||||||
from mmpose.structures import merge_data_samples
|
from mmpose.structures import merge_data_samples
|
||||||
import torch
|
|
||||||
import numpy as np
|
|
||||||
from tqdm import tqdm
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
def fast_check_ffmpeg():
|
def fast_check_ffmpeg():
|
||||||
@@ -331,4 +331,4 @@ if __name__ == "__main__":
|
|||||||
config = OmegaConf.load(args.config)
|
config = OmegaConf.load(args.config)
|
||||||
|
|
||||||
main(config)
|
main(config)
|
||||||
|
|
||||||
|
|||||||
@@ -235,6 +235,7 @@ class Avatar:
|
|||||||
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
|
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
|
||||||
self.idx = self.idx + 1
|
self.idx = self.idx + 1
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
def inference(self, audio_path, out_vid_name, fps, skip_save_images):
|
def inference(self, audio_path, out_vid_name, fps, skip_save_images):
|
||||||
os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
|
os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
|
||||||
print("start inference")
|
print("start inference")
|
||||||
|
|||||||
Reference in New Issue
Block a user