feat: windows infer & gradio

fix: dependencies
feat: v1.5 gradio for windows&linux
2025-04-12 23:19:41 +08:00 · 2025-04-12 01:40:40 +08:00 · 2025-04-11 02:43:04 +08:00 · 2025-04-10 14:02:24 +08:00 · 2025-04-10 13:59:59 +08:00 · 2025-04-10 13:54:16 +08:00
14 changed files with 36 additions and 83 deletions
@@ -5,7 +5,7 @@
 *.pyc
 .ipynb_checkpoints
 results/
-/models/
+models/
 **/__pycache__/
 *.py[cod]
 *$py.class
@@ -14,28 +14,32 @@ mkdir %CheckpointsDir%\sd-vae-ft-mse
 mkdir %CheckpointsDir%\whisper

 :: Install required packages
-pip install -U "huggingface_hub[hf_xet]"
+pip install -U "huggingface_hub[cli]"
+pip install gdown

 :: Set HuggingFace endpoint
 set HF_ENDPOINT=https://hf-mirror.com

 :: Download MuseTalk weights
-hf download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
+huggingface-cli download TMElyralab/MuseTalk --local-dir %CheckpointsDir%

 :: Download SD VAE weights
-hf download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
+huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"

 :: Download Whisper weights
-hf download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
+huggingface-cli download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"

 :: Download DWPose weights
-hf download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
+huggingface-cli download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"

 :: Download SyncNet weights
-hf download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
+huggingface-cli download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"

-:: Download face-parse-bisent weights
-hf download ManyOtherFunctions/face-parse-bisent --local-dir %CheckpointsDir%\face-parse-bisent --include "79999_iter.pth" "resnet18-5c106cde.pth"
+:: Download Face Parse Bisent weights (using gdown)
+gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O %CheckpointsDir%\face-parse-bisent\79999_iter.pth
+
+:: Download ResNet weights
+curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o %CheckpointsDir%\face-parse-bisent\resnet18-5c106cde.pth

 echo All weights have been downloaded successfully!
-endlocal 
+endlocal 
@@ -4,48 +4,34 @@
 CheckpointsDir="models"

 # Create necessary directories
-mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper
+mkdir -p $CheckpointsDir/{musetalk,musetalkV15,syncnet,dwpose,face-parse-bisent,sd-vae-ft-mse,whisper}

 # Install required packages
 pip install -U "huggingface_hub[cli]"
 pip install gdown

-# Set HuggingFace mirror endpoint
+# Set HuggingFace endpoint
 export HF_ENDPOINT=https://hf-mirror.com

-# Download MuseTalk V1.0 weights
-huggingface-cli download TMElyralab/MuseTalk \
-  --local-dir $CheckpointsDir \
-  --include "musetalk/musetalk.json" "musetalk/pytorch_model.bin"
-
-# Download MuseTalk V1.5 weights (unet.pth)
-huggingface-cli download TMElyralab/MuseTalk \
-  --local-dir $CheckpointsDir \
-  --include "musetalkV15/musetalk.json" "musetalkV15/unet.pth"
+# Download MuseTalk weights
+huggingface-cli download TMElyralab/MuseTalk --local-dir $CheckpointsDir

 # Download SD VAE weights
-huggingface-cli download stabilityai/sd-vae-ft-mse \
-  --local-dir $CheckpointsDir/sd-vae \
-  --include "config.json" "diffusion_pytorch_model.bin"
+huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir $CheckpointsDir/sd-vae --include "config.json" "diffusion_pytorch_model.bin"

 # Download Whisper weights
-huggingface-cli download openai/whisper-tiny \
-  --local-dir $CheckpointsDir/whisper \
-  --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
+huggingface-cli download openai/whisper-tiny --local-dir $CheckpointsDir/whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"

 # Download DWPose weights
-huggingface-cli download yzd-v/DWPose \
-  --local-dir $CheckpointsDir/dwpose \
-  --include "dw-ll_ucoco_384.pth"
+huggingface-cli download yzd-v/DWPose --local-dir $CheckpointsDir/dwpose --include "dw-ll_ucoco_384.pth"

 # Download SyncNet weights
-huggingface-cli download ByteDance/LatentSync \
-  --local-dir $CheckpointsDir/syncnet \
-  --include "latentsync_syncnet.pt"
+huggingface-cli download ByteDance/LatentSync --local-dir $CheckpointsDir/syncnet --include "latentsync_syncnet.pt"

-# Download Face Parse Bisent weights
+# Download Face Parse Bisent weights (using gdown)
 gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
-curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \
-  -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth

-echo "✅ All weights have been downloaded successfully!" 
+# Download ResNet weights
+curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
+
+echo "All weights have been downloaded successfully!" 
@@ -15,7 +15,6 @@ from decord.ndarray import cpu

 from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark 
 from musetalk.data import audio 
-from musetalk.utils.audio_utils import ensure_wav

 syncnet_mel_step_size = math.ceil(16 / 5 * 16)  # latentsync

@@ -172,8 +171,7 @@ class FaceDataset(Dataset):
        """
        if not os.path.exists(wav_path):
            return None
-        wav_path_converted = ensure_wav(wav_path)
-        audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
+        audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
        assert sampling_rate == 16000

        while start_index >= 25 * 30:
@@ -208,12 +206,11 @@ class FaceDataset(Dataset):
        if not os.path.exists(wav_path):
            return None

-        wav_path_converted = ensure_wav(wav_path)
-        audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
+        audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
        assert sampling_rate == 16000

-        audio_mel = self.mel_feature_extractor(audio_input_librosa)
-        return audio_mel, start_index
+        audio_input = self.mel_feature_extractor(audio_input)
+        return audio_input, start_index

    def mel_feature_extractor(self, audio_input):
        """Extract mel spectrogram features
@@ -1,17 +0,0 @@
-import os, subprocess
-
-def ensure_wav(input_path: str, target_path: str | None = None) -> str:
-    """
-    Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
-    Returns path to the converted .wav (original if already correct).
-    """
-    if not isinstance(input_path, str) or not os.path.exists(input_path):
-        return input_path
-    base, ext = os.path.splitext(input_path)
-    ext = ext.lower()
-    
-    if target_path is None:
-        target_path = base + "_16k.wav"
-    cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
-    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-    return target_path
@@ -118,8 +118,7 @@ def get_landmark_and_bbox(img_list,upperbondrange =0):
            if upperbondrange != 0:
                half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整  + 向下（偏29）  - 向上（偏28）
            half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
-            min_upper_bond = 0
-            upper_bond = max(min_upper_bond, half_face_coord[1] - half_face_dist)
+            upper_bond = half_face_coord[1]-half_face_dist
            
            f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
            x1, y1, x2, y2 = f_landmark
@@ -1,15 +0,0 @@
-[build-system]
-requires = ["setuptools>=64"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "musetalk"
-version = "1.5.0"
-description = "MuseTalk: audio-driven lip-sync (source-only install; dependencies managed by the consumer)"
-readme = "README.md"
-requires-python = ">=3.10"
-license = { text = "MIT" }
-
-[tool.setuptools.packages.find]
-include = ["musetalk*"]
-exclude = ["scripts*", "assets*", "data*", "configs*"]
@@ -1,9 +1,6 @@
 import os
 import argparse
 import subprocess
-import torch
-import numpy as np
-from tqdm import tqdm
 from omegaconf import OmegaConf
 from typing import Tuple, List, Union
 import decord
@@ -12,6 +9,9 @@ import cv2
 from musetalk.utils.face_detection import FaceAlignment,LandmarksType
 from mmpose.apis import inference_topdown, init_model
 from mmpose.structures import merge_data_samples
+import torch
+import numpy as np
+from tqdm import tqdm
 import sys

 def fast_check_ffmpeg():
@@ -331,4 +331,4 @@ if __name__ == "__main__":
    config = OmegaConf.load(args.config)

    main(config)
-    
+    
@@ -235,7 +235,6 @@ class Avatar:
                cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
            self.idx = self.idx + 1

-    @torch.no_grad()
    def inference(self, audio_path, out_vid_name, fps, skip_save_images):
        os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
        print("start inference")
Author	SHA1	Message	Date
NeRF-Factory	8795fa1425	feat: windows infer & gradio	2025-04-12 23:19:41 +08:00
zzzweakman	be656b199b	fix: dependencies	2025-04-12 01:40:40 +08:00
zzzweakman	b9b459a119	feat: v1.5 gradio for windows&linux	2025-04-11 02:43:04 +08:00
zzzweakman	2e5b74a257	docs: update readme	2025-04-10 14:02:24 +08:00
zzzweakman	a0834ec2c2	docs: update readme	2025-04-10 13:59:59 +08:00
zzzweakman	0702078902	fix: windows infer	2025-04-10 13:54:16 +08:00