6 Commits

Author SHA1 Message Date
NeRF-Factory 8795fa1425 feat: windows infer & gradio 2025-04-12 23:19:41 +08:00
zzzweakman be656b199b fix: dependencies 2025-04-12 01:40:40 +08:00
zzzweakman b9b459a119 feat: v1.5 gradio for windows&linux 2025-04-11 02:43:04 +08:00
zzzweakman 2e5b74a257 docs: update readme 2025-04-10 14:02:24 +08:00
zzzweakman a0834ec2c2 docs: update readme 2025-04-10 13:59:59 +08:00
zzzweakman 0702078902 fix: windows infer 2025-04-10 13:54:16 +08:00
14 changed files with 36 additions and 83 deletions
+1 -1
View File
@@ -5,7 +5,7 @@
*.pyc
.ipynb_checkpoints
results/
/models/
models/
**/__pycache__/
*.py[cod]
*$py.class
+13 -9
View File
@@ -14,28 +14,32 @@ mkdir %CheckpointsDir%\sd-vae-ft-mse
mkdir %CheckpointsDir%\whisper
:: Install required packages
pip install -U "huggingface_hub[hf_xet]"
pip install -U "huggingface_hub[cli]"
pip install gdown
:: Set HuggingFace endpoint
set HF_ENDPOINT=https://hf-mirror.com
:: Download MuseTalk weights
hf download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
huggingface-cli download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
:: Download SD VAE weights
hf download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
:: Download Whisper weights
hf download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
huggingface-cli download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
:: Download DWPose weights
hf download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
huggingface-cli download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
:: Download SyncNet weights
hf download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
huggingface-cli download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
:: Download face-parse-bisent weights
hf download ManyOtherFunctions/face-parse-bisent --local-dir %CheckpointsDir%\face-parse-bisent --include "79999_iter.pth" "resnet18-5c106cde.pth"
:: Download Face Parse Bisent weights (using gdown)
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O %CheckpointsDir%\face-parse-bisent\79999_iter.pth
:: Download ResNet weights
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o %CheckpointsDir%\face-parse-bisent\resnet18-5c106cde.pth
echo All weights have been downloaded successfully!
endlocal
endlocal
+13 -27
View File
@@ -4,48 +4,34 @@
CheckpointsDir="models"
# Create necessary directories
mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper
mkdir -p $CheckpointsDir/{musetalk,musetalkV15,syncnet,dwpose,face-parse-bisent,sd-vae-ft-mse,whisper}
# Install required packages
pip install -U "huggingface_hub[cli]"
pip install gdown
# Set HuggingFace mirror endpoint
# Set HuggingFace endpoint
export HF_ENDPOINT=https://hf-mirror.com
# Download MuseTalk V1.0 weights
huggingface-cli download TMElyralab/MuseTalk \
--local-dir $CheckpointsDir \
--include "musetalk/musetalk.json" "musetalk/pytorch_model.bin"
# Download MuseTalk V1.5 weights (unet.pth)
huggingface-cli download TMElyralab/MuseTalk \
--local-dir $CheckpointsDir \
--include "musetalkV15/musetalk.json" "musetalkV15/unet.pth"
# Download MuseTalk weights
huggingface-cli download TMElyralab/MuseTalk --local-dir $CheckpointsDir
# Download SD VAE weights
huggingface-cli download stabilityai/sd-vae-ft-mse \
--local-dir $CheckpointsDir/sd-vae \
--include "config.json" "diffusion_pytorch_model.bin"
huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir $CheckpointsDir/sd-vae --include "config.json" "diffusion_pytorch_model.bin"
# Download Whisper weights
huggingface-cli download openai/whisper-tiny \
--local-dir $CheckpointsDir/whisper \
--include "config.json" "pytorch_model.bin" "preprocessor_config.json"
huggingface-cli download openai/whisper-tiny --local-dir $CheckpointsDir/whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
# Download DWPose weights
huggingface-cli download yzd-v/DWPose \
--local-dir $CheckpointsDir/dwpose \
--include "dw-ll_ucoco_384.pth"
huggingface-cli download yzd-v/DWPose --local-dir $CheckpointsDir/dwpose --include "dw-ll_ucoco_384.pth"
# Download SyncNet weights
huggingface-cli download ByteDance/LatentSync \
--local-dir $CheckpointsDir/syncnet \
--include "latentsync_syncnet.pt"
huggingface-cli download ByteDance/LatentSync --local-dir $CheckpointsDir/syncnet --include "latentsync_syncnet.pt"
# Download Face Parse Bisent weights
# Download Face Parse Bisent weights (using gdown)
gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \
-o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
echo "✅ All weights have been downloaded successfully!"
# Download ResNet weights
curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
echo "All weights have been downloaded successfully!"
View File
View File
+4 -7
View File
@@ -15,7 +15,6 @@ from decord.ndarray import cpu
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
from musetalk.data import audio
from musetalk.utils.audio_utils import ensure_wav
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
@@ -172,8 +171,7 @@ class FaceDataset(Dataset):
"""
if not os.path.exists(wav_path):
return None
wav_path_converted = ensure_wav(wav_path)
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
assert sampling_rate == 16000
while start_index >= 25 * 30:
@@ -208,12 +206,11 @@ class FaceDataset(Dataset):
if not os.path.exists(wav_path):
return None
wav_path_converted = ensure_wav(wav_path)
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
assert sampling_rate == 16000
audio_mel = self.mel_feature_extractor(audio_input_librosa)
return audio_mel, start_index
audio_input = self.mel_feature_extractor(audio_input)
return audio_input, start_index
def mel_feature_extractor(self, audio_input):
"""Extract mel spectrogram features
View File
View File
-17
View File
@@ -1,17 +0,0 @@
import os, subprocess
def ensure_wav(input_path: str, target_path: str | None = None) -> str:
"""
Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
Returns path to the converted .wav (original if already correct).
"""
if not isinstance(input_path, str) or not os.path.exists(input_path):
return input_path
base, ext = os.path.splitext(input_path)
ext = ext.lower()
if target_path is None:
target_path = base + "_16k.wav"
cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
return target_path
View File
+1 -2
View File
@@ -118,8 +118,7 @@ def get_landmark_and_bbox(img_list,upperbondrange =0):
if upperbondrange != 0:
half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整 + 向下(偏29) - 向上(偏28)
half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
min_upper_bond = 0
upper_bond = max(min_upper_bond, half_face_coord[1] - half_face_dist)
upper_bond = half_face_coord[1]-half_face_dist
f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
x1, y1, x2, y2 = f_landmark
-15
View File
@@ -1,15 +0,0 @@
[build-system]
requires = ["setuptools>=64"]
build-backend = "setuptools.build_meta"
[project]
name = "musetalk"
version = "1.5.0"
description = "MuseTalk: audio-driven lip-sync (source-only install; dependencies managed by the consumer)"
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
[tool.setuptools.packages.find]
include = ["musetalk*"]
exclude = ["scripts*", "assets*", "data*", "configs*"]
+4 -4
View File
@@ -1,9 +1,6 @@
import os
import argparse
import subprocess
import torch
import numpy as np
from tqdm import tqdm
from omegaconf import OmegaConf
from typing import Tuple, List, Union
import decord
@@ -12,6 +9,9 @@ import cv2
from musetalk.utils.face_detection import FaceAlignment,LandmarksType
from mmpose.apis import inference_topdown, init_model
from mmpose.structures import merge_data_samples
import torch
import numpy as np
from tqdm import tqdm
import sys
def fast_check_ffmpeg():
@@ -331,4 +331,4 @@ if __name__ == "__main__":
config = OmegaConf.load(args.config)
main(config)
-1
View File
@@ -235,7 +235,6 @@ class Avatar:
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
self.idx = self.idx + 1
@torch.no_grad()
def inference(self, audio_path, out_vid_name, fps, skip_save_images):
os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
print("start inference")