feat: windows infer & gradio

fix: dependencies
feat: v1.5 gradio for windows&linux
2025-04-12 23:19:41 +08:00 · 2025-04-12 01:40:40 +08:00 · 2025-04-11 02:43:04 +08:00 · 2025-04-10 14:02:24 +08:00 · 2025-04-10 13:59:59 +08:00 · 2025-04-10 13:54:16 +08:00
14 changed files with 36 additions and 83 deletions
@@ -5,7 +5,7 @@
 *.pyc
 .ipynb_checkpoints
 results/
-/models/
+models/
 **/__pycache__/
 *.py[cod]
 *$py.class
@@ -14,28 +14,32 @@ mkdir %CheckpointsDir%\sd-vae-ft-mse
 mkdir %CheckpointsDir%\whisper
 :: Install required packages
-pip install -U "huggingface_hub[hf_xet]"
+pip install -U "huggingface_hub[cli]"
 pip install gdown
 :: Set HuggingFace endpoint
 set HF_ENDPOINT=https://hf-mirror.com
 :: Download MuseTalk weights
-hf download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
+huggingface-cli download TMElyralab/MuseTalk --local-dir %CheckpointsDir%
 :: Download SD VAE weights
-hf download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
+huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir %CheckpointsDir%\sd-vae --include "config.json" "diffusion_pytorch_model.bin"
 :: Download Whisper weights
-hf download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
+huggingface-cli download openai/whisper-tiny --local-dir %CheckpointsDir%\whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
 :: Download DWPose weights
-hf download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
+huggingface-cli download yzd-v/DWPose --local-dir %CheckpointsDir%\dwpose --include "dw-ll_ucoco_384.pth"
 :: Download SyncNet weights
-hf download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
+huggingface-cli download ByteDance/LatentSync --local-dir %CheckpointsDir%\syncnet --include "latentsync_syncnet.pt"
-:: Download face-parse-bisent weights
+:: Download Face Parse Bisent weights (using gdown)
-hf download ManyOtherFunctions/face-parse-bisent --local-dir %CheckpointsDir%\face-parse-bisent --include "79999_iter.pth" "resnet18-5c106cde.pth"
+gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O %CheckpointsDir%\face-parse-bisent\79999_iter.pth
 :: Download ResNet weights
 curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o %CheckpointsDir%\face-parse-bisent\resnet18-5c106cde.pth
 echo All weights have been downloaded successfully!
 endlocal 
@@ -4,48 +4,34 @@
 CheckpointsDir="models"
 # Create necessary directories
-mkdir -p models/musetalk models/musetalkV15 models/syncnet models/dwpose models/face-parse-bisent models/sd-vae models/whisper
+mkdir -p $CheckpointsDir/{musetalk,musetalkV15,syncnet,dwpose,face-parse-bisent,sd-vae-ft-mse,whisper}
 # Install required packages
 pip install -U "huggingface_hub[cli]"
 pip install gdown
-# Set HuggingFace mirror endpoint
+# Set HuggingFace endpoint
 export HF_ENDPOINT=https://hf-mirror.com
-# Download MuseTalk V1.0 weights
+# Download MuseTalk weights
-huggingface-cli download TMElyralab/MuseTalk \
+huggingface-cli download TMElyralab/MuseTalk --local-dir $CheckpointsDir
  --local-dir $CheckpointsDir \
  --include "musetalk/musetalk.json" "musetalk/pytorch_model.bin"
 # Download MuseTalk V1.5 weights (unet.pth)
 huggingface-cli download TMElyralab/MuseTalk \
  --local-dir $CheckpointsDir \
  --include "musetalkV15/musetalk.json" "musetalkV15/unet.pth"
 # Download SD VAE weights
-huggingface-cli download stabilityai/sd-vae-ft-mse \
+huggingface-cli download stabilityai/sd-vae-ft-mse --local-dir $CheckpointsDir/sd-vae --include "config.json" "diffusion_pytorch_model.bin"
  --local-dir $CheckpointsDir/sd-vae \
  --include "config.json" "diffusion_pytorch_model.bin"
 # Download Whisper weights
-huggingface-cli download openai/whisper-tiny \
+huggingface-cli download openai/whisper-tiny --local-dir $CheckpointsDir/whisper --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
  --local-dir $CheckpointsDir/whisper \
  --include "config.json" "pytorch_model.bin" "preprocessor_config.json"
 # Download DWPose weights
-huggingface-cli download yzd-v/DWPose \
+huggingface-cli download yzd-v/DWPose --local-dir $CheckpointsDir/dwpose --include "dw-ll_ucoco_384.pth"
  --local-dir $CheckpointsDir/dwpose \
  --include "dw-ll_ucoco_384.pth"
 # Download SyncNet weights
-huggingface-cli download ByteDance/LatentSync \
+huggingface-cli download ByteDance/LatentSync --local-dir $CheckpointsDir/syncnet --include "latentsync_syncnet.pt"
  --local-dir $CheckpointsDir/syncnet \
  --include "latentsync_syncnet.pt"
-# Download Face Parse Bisent weights
+# Download Face Parse Bisent weights (using gdown)
 gdown --id 154JgKpzCPW82qINcVieuPH3fZ2e0P812 -O $CheckpointsDir/face-parse-bisent/79999_iter.pth
 curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth \
  -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
-echo "✅ All weights have been downloaded successfully!" 
+# Download ResNet weights
 curl -L https://download.pytorch.org/models/resnet18-5c106cde.pth -o $CheckpointsDir/face-parse-bisent/resnet18-5c106cde.pth
 echo "All weights have been downloaded successfully!" 
@@ -15,7 +15,6 @@ from decord.ndarray import cpu
 from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark 
 from musetalk.data import audio 
 from musetalk.utils.audio_utils import ensure_wav
 syncnet_mel_step_size = math.ceil(16 / 5 * 16)  # latentsync
@@ -172,8 +171,7 @@ class FaceDataset(Dataset):
        """
        if not os.path.exists(wav_path):
            return None
-        wav_path_converted = ensure_wav(wav_path)
+        audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
        audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
        assert sampling_rate == 16000
        while start_index >= 25 * 30:
@@ -208,12 +206,11 @@ class FaceDataset(Dataset):
        if not os.path.exists(wav_path):
            return None
-        wav_path_converted = ensure_wav(wav_path)
+        audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
        audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
        assert sampling_rate == 16000
-        audio_mel = self.mel_feature_extractor(audio_input_librosa)
+        audio_input = self.mel_feature_extractor(audio_input)
-        return audio_mel, start_index
+        return audio_input, start_index
    def mel_feature_extractor(self, audio_input):
        """Extract mel spectrogram features
@@ -1,17 +0,0 @@
 import os, subprocess
 def ensure_wav(input_path: str, target_path: str | None = None) -> str:
    """
    Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
    Returns path to the converted .wav (original if already correct).
    """
    if not isinstance(input_path, str) or not os.path.exists(input_path):
        return input_path
    base, ext = os.path.splitext(input_path)
    ext = ext.lower()
    if target_path is None:
        target_path = base + "_16k.wav"
    cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return target_path
@@ -118,8 +118,7 @@ def get_landmark_and_bbox(img_list,upperbondrange =0):
            if upperbondrange != 0:
                half_face_coord[1] = upperbondrange+half_face_coord[1] #手动调整  + 向下（偏29）  - 向上（偏28）
            half_face_dist = np.max(face_land_mark[:,1]) - half_face_coord[1]
-            min_upper_bond = 0
+            upper_bond = half_face_coord[1]-half_face_dist
            upper_bond = max(min_upper_bond, half_face_coord[1] - half_face_dist)
            f_landmark = (np.min(face_land_mark[:, 0]),int(upper_bond),np.max(face_land_mark[:, 0]),np.max(face_land_mark[:,1]))
            x1, y1, x2, y2 = f_landmark
@@ -1,15 +0,0 @@
 # Build configuration: the package is built with setuptools (version 64 or newer).
 [build-system]
 requires = ["setuptools>=64"]
 build-backend = "setuptools.build_meta"
 # Project metadata. No `dependencies` key is listed in this section, which
 # matches the "dependencies managed by the consumer" note in the description.
 [project]
 name = "musetalk"
 version = "1.5.0"
 description = "MuseTalk: audio-driven lip-sync (source-only install; dependencies managed by the consumer)"
 readme = "README.md"
 requires-python = ">=3.10"
 license = { text = "MIT" }
 # Package discovery: include packages matching `musetalk*` and exclude the
 # listed top-level directory patterns.
 [tool.setuptools.packages.find]
 include = ["musetalk*"]
 exclude = ["scripts*", "assets*", "data*", "configs*"]
@@ -1,9 +1,6 @@
 import os
 import argparse
 import subprocess
 import torch
 import numpy as np
 from tqdm import tqdm
 from omegaconf import OmegaConf
 from typing import Tuple, List, Union
 import decord
@@ -12,6 +9,9 @@ import cv2
 from musetalk.utils.face_detection import FaceAlignment,LandmarksType
 from mmpose.apis import inference_topdown, init_model
 from mmpose.structures import merge_data_samples
 import torch
 import numpy as np
 from tqdm import tqdm
 import sys
 def fast_check_ffmpeg():
@@ -235,7 +235,6 @@ class Avatar:
                cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
            self.idx = self.idx + 1
    @torch.no_grad()
    def inference(self, audio_path, out_vid_name, fps, skip_save_images):
        os.makedirs(self.avatar_path + '/tmp', exist_ok=True)
        print("start inference")
Author	SHA1	Message	Date
NeRF-Factory	8795fa1425	feat: windows infer & gradio	2025-04-12 23:19:41 +08:00
zzzweakman	be656b199b	fix: dependencies	2025-04-12 01:40:40 +08:00
zzzweakman	b9b459a119	feat: v1.5 gradio for windows&linux	2025-04-11 02:43:04 +08:00
zzzweakman	2e5b74a257	docs: update readme	2025-04-10 14:02:24 +08:00
zzzweakman	a0834ec2c2	docs: update readme	2025-04-10 13:59:59 +08:00
zzzweakman	0702078902	fix: windows infer	2025-04-10 13:54:16 +08:00