diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..b37204e
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,36 @@
+# The .dockerignore file excludes files from the container build process.
+#
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+
+# Replicate
+*.mp4
+*.pth
+*.pt
+*.bin
+*.ckpt
+*.onnx
+*.tar
+*.tar.gz
+*.h5
+*.pb
+*.caffemodel
+*.weights
+pretrained_weights/
+output/
+.venv/
+mlruns/
+data/
+
+# Exclude Git files
+.git
+.github
+.gitignore
+
+# Exclude Python cache files
+__pycache__
+.mypy_cache
+.pytest_cache
+.ruff_cache
+
+# Exclude Python virtual environment
+/venv
diff --git a/.gitignore b/.gitignore
index 569e4c1..25cc439 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,15 @@ data/
 *.pth
 *.pt
 *.pkl
-*.bin
\ No newline at end of file
+*.bin
+
+# Replicate
+*.mp4
+*.ckpt
+*.onnx
+*.tar
+*.tar.gz
+*.h5
+*.pb
+*.caffemodel
+*.weights
diff --git a/README.md b/README.md
index 6887237..3987db3 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # 🤗 Introduction
 
+[![Replicate](https://replicate.com/zsxkib/moore-animateanyone/badge)](https://replicate.com/zsxkib/moore-animateanyone)
+
+
 **update** 🏋️🏋️🏋️ We release our training codes!! Now you can train your own AnimateAnyone models. See [here](#train) for more details. Have fun!
 
 **update**:🔥🔥🔥 We launch a HuggingFace Spaces demo of Moore-AnimateAnyone at [here](https://huggingface.co/spaces/xunsong/Moore-AnimateAnyone)!!
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..da9ac04
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,56 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+  cuda: "12.1"
+
+  # a list of ubuntu apt packages to install
+  # system_packages:
+  #   - "libgl1-mesa-glx"
+  #   - "libglib2.0-0"
+
+  # python version in the form '3.11' or '3.11.4'
+  python_version: "3.11"
+
+  # a list of packages in the format <package-name>==<version>
+  python_packages:
+    - accelerate==0.21.0
+    - av==11.0.0
+    - decord==0.6.0
+    - diffusers==0.24.0
+    - einops==0.4.1
+    - gradio<=3.50.2
+    - imageio==2.33.0
+    - imageio-ffmpeg==0.4.9
+    - numpy>=1.23.5
+    - omegaconf==2.2.3
+    - onnxruntime-gpu==1.16.3
+    - open-clip-torch==2.20.0
+    - opencv-contrib-python==4.8.1.78
+    - opencv-python==4.8.1.78
+    - Pillow==9.5.0
+    - scikit-image==0.21.0
+    - scikit-learn==1.3.2
+    - scipy==1.11.4
+    - torch==2.0.1
+    - torchdiffeq==0.2.3
+    - torchmetrics==1.2.1
+    - torchsde==0.2.5
+    - torchvision==0.15.2
+    - tqdm==4.66.1
+    - transformers==4.30.2
+    - mlflow==2.9.2
+    - xformers==0.0.22
+    - controlnet-aux==0.0.7
+    - https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip
+
+  # commands run after the environment is setup
+  run:
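+    # pget is Replicate's parallel file downloader; predict.py's setup() runs
+    # it as "pget -x" to fetch and untar the pretrained weights in one step.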
+    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.5.6/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..bd83846
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,246 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import subprocess
+import time
+from cog import BasePredictor, Input, Path
+import os
+from datetime import datetime
+
+import numpy as np
+import torch
+from diffusers import AutoencoderKL, DDIMScheduler
+from einops import repeat
+from omegaconf import OmegaConf
+from PIL import Image
+from torchvision import transforms
+from transformers import CLIPVisionModelWithProjection
+
+from src.models.pose_guider import PoseGuider
+from src.models.unet_2d_condition import UNet2DConditionModel
+from src.models.unet_3d import UNet3DConditionModel
+from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
+from src.utils.util import get_fps, read_frames, save_videos_grid
+
+MOORE_ANIMATEANYONE_CACHE = "./pretrained_weights"
+MOORE_ANIMATEANYONE_URL = "https://storage.googleapis.com/replicate-weights/Moore-AnimateAnyone/pretrained_weights.tar"
+
+
+class AnimateController:
+    def __init__(
+        self,
+        config_path="./configs/prompts/animation.yaml",
+        weight_dtype=torch.float16,
+    ):
+        # Read pretrained weights path from config
+        self.config = OmegaConf.load(config_path)
+        self.pipeline = None
+        self.weight_dtype = weight_dtype
+
+    def animate(
+        self,
+        ref_image,
+        pose_video_path,
+        width=512,
+        height=768,
+        length=24,
+        num_inference_steps=25,
+        cfg=3.5,
+        seed=123,
+    ):
+        generator = torch.manual_seed(seed)
+        if isinstance(ref_image, np.ndarray):
+            ref_image = Image.fromarray(ref_image)
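+        # Build the heavy diffusion pipeline lazily on the first call and cache
+        # it on the instance, so subsequent predictions skip model loading.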
+        if self.pipeline is None:
+            vae = AutoencoderKL.from_pretrained(
+                self.config.pretrained_vae_path,
+                cache_dir=MOORE_ANIMATEANYONE_CACHE,
+                local_files_only=True,
+            ).to("cuda", dtype=self.weight_dtype)
+
+            reference_unet = UNet2DConditionModel.from_pretrained(
+                self.config.pretrained_base_model_path,
+                subfolder="unet",
+                cache_dir=MOORE_ANIMATEANYONE_CACHE,
+                local_files_only=True,
+            ).to(dtype=self.weight_dtype, device="cuda")
+
+            inference_config_path = self.config.inference_config
+            infer_config = OmegaConf.load(inference_config_path)
+            denoising_unet = UNet3DConditionModel.from_pretrained_2d(
+                self.config.pretrained_base_model_path,
+                self.config.motion_module_path,
+                subfolder="unet",
+                unet_additional_kwargs=infer_config.unet_additional_kwargs,
+            ).to(dtype=self.weight_dtype, device="cuda")
+
+            pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(
+                dtype=self.weight_dtype, device="cuda"
+            )
+
+            image_enc = CLIPVisionModelWithProjection.from_pretrained(
+                self.config.image_encoder_path,
+                cache_dir=MOORE_ANIMATEANYONE_CACHE,
+                local_files_only=True,
+            ).to(dtype=self.weight_dtype, device="cuda")
+            sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+            scheduler = DDIMScheduler(**sched_kwargs)
+
+            # load pretrained weights
+            denoising_unet.load_state_dict(
+                torch.load(self.config.denoising_unet_path, map_location="cpu"),
+                strict=False,
+            )
+            reference_unet.load_state_dict(
+                torch.load(self.config.reference_unet_path, map_location="cpu"),
+            )
+            pose_guider.load_state_dict(
+                torch.load(self.config.pose_guider_path, map_location="cpu"),
+            )
+
+            pipe = Pose2VideoPipeline(
+                vae=vae,
+                image_encoder=image_enc,
+                reference_unet=reference_unet,
+                denoising_unet=denoising_unet,
+                pose_guider=pose_guider,
+                scheduler=scheduler,
+            )
+            pipe = pipe.to("cuda", dtype=self.weight_dtype)
+            self.pipeline = pipe
+
+        pose_images = read_frames(pose_video_path)
+        src_fps = get_fps(pose_video_path)
+
+        pose_list = []
+        pose_tensor_list = []
+        pose_transform = transforms.Compose(
+            [transforms.Resize((height, width)), transforms.ToTensor()]
+        )
+        for pose_image_pil in pose_images[:length]:
+            pose_list.append(pose_image_pil)
+            pose_tensor_list.append(pose_transform(pose_image_pil))
+
+        video = self.pipeline(
+            ref_image,
+            pose_list,
+            width=width,
+            height=height,
+            video_length=length,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=cfg,
+            generator=generator,
+        ).videos
+
+        ref_image_tensor = pose_transform(ref_image)  # (c, h, w)
+        ref_image_tensor = ref_image_tensor.unsqueeze(1).unsqueeze(0)  # (1, c, 1, h, w)
+        ref_image_tensor = repeat(
+            ref_image_tensor, "b c f h w -> b c (repeat f) h w", repeat=length
+        )
+        pose_tensor = torch.stack(pose_tensor_list, dim=0)  # (f, c, h, w)
+        pose_tensor = pose_tensor.transpose(0, 1)
+        pose_tensor = pose_tensor.unsqueeze(0)
+
+        # ref_image_tensor = ref_image_tensor[:, :3, :, :, :]
+        # video = torch.cat([ref_image_tensor, pose_tensor, video], dim=0)
+
+        save_dir = "./output/gradio"
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir, exist_ok=True)
+        date_str = datetime.now().strftime("%Y%m%d")
+        time_str = datetime.now().strftime("%H%M")
+        out_path = os.path.join(save_dir, f"{date_str}T{time_str}.mp4")
+        save_videos_grid(
+            video,
+            out_path,
+            n_rows=1,
+            fps=src_fps,
+        )
+
+        torch.cuda.empty_cache()
+
+        return out_path
+
+
+def download_weights(url, dest):
+    start = time.time()
+    print("downloading url: ", url)
+    print("downloading to: ", dest)
+    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
+    print("downloading took: ", time.time() - start)
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+        if not os.path.exists(MOORE_ANIMATEANYONE_CACHE):
+            download_weights(MOORE_ANIMATEANYONE_URL, MOORE_ANIMATEANYONE_CACHE)
+
+        self.controller = AnimateController()
+
+    def predict(
+        self,
+        reference_image: Path = Input(description="Path to the reference image"),
+        motion_sequence: Path = Input(description="Path to the motion sequence video"),
+        width: int = Input(
+            description="Desired width of the output video",
+            default=512,
+            ge=448,
+            le=768,
+        ),
+        height: int = Input(
+            description="Desired height of the output video",
+            default=768,
+            ge=512,
+            le=1024,
+        ),
+        length: int = Input(
+            description="Desired length of the output video in frames",
+            default=24,
+            ge=24,
+            le=128,
+        ),
+        sampling_steps: int = Input(
+            description="Number of sampling steps for the animation",
+            default=25,
+            ge=10,
+            le=30,
+        ),
+        guidance_scale: float = Input(
+            description="Scale for guidance during animation generation",
+            default=3.5,
+            ge=2.0,
+            le=10.0,
+        ),
+        seed: int = Input(
+            description="Random seed. Leave blank to randomize the seed", default=None
+        ),
+    ) -> Path:
+        """Run a single prediction on the model"""
+
+        if seed is None:
+            seed = int.from_bytes(os.urandom(2), "big")
+        print(f"Using seed: {seed}")
+
+        reference_image = Image.open(str(reference_image))
+        motion_sequence = str(motion_sequence)
+
+        # Call the animate method from the controller
+        animation = self.controller.animate(
+            reference_image,
+            motion_sequence,
+            width,
+            height,
+            length,
+            sampling_steps,
+            guidance_scale,
+            seed,
+        )
+
+        return Path(animation)
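+
+
+# Example local invocation via Cog (hypothetical input files):
+#   cog predict -i reference_image=@ref.png -i motion_sequence=@dance.mp4 -i length=24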