36 changes: 36 additions & 0 deletions .dockerignore
@@ -0,0 +1,36 @@
# The .dockerignore file excludes files from the container build process.
#
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

# Replicate
*.mp4
*.pth
*.pt
*.bin
*.ckpt
*.onnx
*.tar
*.tar.gz
*.h5
*.pb
*.caffemodel
*.weights
pretrained_weights/
output/
.venv/
mlruns/
data/

# Exclude Git files
.git
.github
.gitignore

# Exclude Python cache files
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache

# Exclude Python virtual environment
/venv
13 changes: 12 additions & 1 deletion .gitignore
@@ -8,4 +8,15 @@ data/
*.pth
*.pt
*.pkl
*.bin
*.bin

# Replicate
*.mp4
*.ckpt
*.onnx
*.tar
*.tar.gz
*.h5
*.pb
*.caffemodel
*.weights
3 changes: 3 additions & 0 deletions README.md
@@ -1,5 +1,8 @@
# 🤗 Introduction

[![Replicate](https://replicate.com/zsxkib/moore-animateanyone/badge)](https://replicate.com/zsxkib/moore-animateanyone)


**update** 🏋️🏋️🏋️ We have released our training code! Now you can train your own AnimateAnyone models. See [here](#train) for more details. Have fun!

**update**: 🔥🔥🔥 We have launched a Hugging Face Spaces demo of Moore-AnimateAnyone [here](https://huggingface.co/spaces/xunsong/Moore-AnimateAnyone)!
54 changes: 54 additions & 0 deletions cog.yaml
@@ -0,0 +1,54 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
# set to true if your model requires a GPU
gpu: true
cuda: "12.1"

# a list of ubuntu apt packages to install
# system_packages:
# - "libgl1-mesa-glx"
# - "libglib2.0-0"

# python version in the form '3.11' or '3.11.4'
python_version: "3.11"

# a list of packages in the format <package-name>==<version>
python_packages:
- accelerate==0.21.0
- av==11.0.0
- decord==0.6.0
- diffusers==0.24.0
- einops==0.4.1
- gradio<=3.50.2
- imageio==2.33.0
- imageio-ffmpeg==0.4.9
- numpy>=1.23.5
- omegaconf==2.2.3
- onnxruntime-gpu==1.16.3
- open-clip-torch==2.20.0
- opencv-contrib-python==4.8.1.78
- opencv-python==4.8.1.78
- Pillow==9.5.0
- scikit-image==0.21.0
- scikit-learn==1.3.2
- scipy==1.11.4
- torch==2.0.1
- torchdiffeq==0.2.3
- torchmetrics==1.2.1
- torchsde==0.2.5
- torchvision==0.15.2
- tqdm==4.66.1
- transformers==4.30.2
- mlflow==2.9.2
- xformers==0.0.22
- controlnet-aux==0.0.7
- https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip

# commands run after the environment is setup
run:
- curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.5.6/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
240 changes: 240 additions & 0 deletions predict.py
@@ -0,0 +1,240 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

import subprocess
import time
from cog import BasePredictor, Input, Path
import os
from datetime import datetime

import numpy as np
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from einops import repeat
from omegaconf import OmegaConf
from PIL import Image
from torchvision import transforms
from transformers import CLIPVisionModelWithProjection

from src.models.pose_guider import PoseGuider
from src.models.unet_2d_condition import UNet2DConditionModel
from src.models.unet_3d import UNet3DConditionModel
from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
from src.utils.util import get_fps, read_frames, save_videos_grid

MOORE_ANIMATEANYONE_CACHE = "./pretrained_weights"
MOORE_ANIMATEANYONE_URL = "https://storage.googleapis.com/replicate-weights/Moore-AnimateAnyone/pretrained_weights.tar"


class AnimateController:
def __init__(
self,
config_path="./configs/prompts/animation.yaml",
weight_dtype=torch.float16,
):
# Read pretrained weights path from config
self.config = OmegaConf.load(config_path)
self.pipeline = None
self.weight_dtype = weight_dtype

def animate(
self,
ref_image,
pose_video_path,
width=512,
height=768,
length=24,
num_inference_steps=25,
cfg=3.5,
seed=123,
):
generator = torch.manual_seed(seed)
if isinstance(ref_image, np.ndarray):
ref_image = Image.fromarray(ref_image)
if self.pipeline is None:
vae = AutoencoderKL.from_pretrained(
self.config.pretrained_vae_path,
cache_dir=MOORE_ANIMATEANYONE_CACHE,
local_files_only=True,
).to("cuda", dtype=self.weight_dtype)

reference_unet = UNet2DConditionModel.from_pretrained(
self.config.pretrained_base_model_path,
subfolder="unet",
cache_dir=MOORE_ANIMATEANYONE_CACHE,
local_files_only=True,
).to(dtype=self.weight_dtype, device="cuda")

inference_config_path = self.config.inference_config
infer_config = OmegaConf.load(inference_config_path)
denoising_unet = UNet3DConditionModel.from_pretrained_2d(
self.config.pretrained_base_model_path,
self.config.motion_module_path,
subfolder="unet",
unet_additional_kwargs=infer_config.unet_additional_kwargs,
).to(dtype=self.weight_dtype, device="cuda")

pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(
dtype=self.weight_dtype, device="cuda"
)

image_enc = CLIPVisionModelWithProjection.from_pretrained(
self.config.image_encoder_path,
cache_dir=MOORE_ANIMATEANYONE_CACHE,
local_files_only=True,
).to(dtype=self.weight_dtype, device="cuda")
sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
scheduler = DDIMScheduler(**sched_kwargs)

# load pretrained weights
denoising_unet.load_state_dict(
torch.load(self.config.denoising_unet_path, map_location="cpu"),
strict=False,
)
reference_unet.load_state_dict(
torch.load(self.config.reference_unet_path, map_location="cpu"),
)
pose_guider.load_state_dict(
torch.load(self.config.pose_guider_path, map_location="cpu"),
)

pipe = Pose2VideoPipeline(
vae=vae,
image_encoder=image_enc,
reference_unet=reference_unet,
denoising_unet=denoising_unet,
pose_guider=pose_guider,
scheduler=scheduler,
)
pipe = pipe.to("cuda", dtype=self.weight_dtype)
self.pipeline = pipe

pose_images = read_frames(pose_video_path)
src_fps = get_fps(pose_video_path)

pose_list = []
pose_tensor_list = []
pose_transform = transforms.Compose(
[transforms.Resize((height, width)), transforms.ToTensor()]
)
for pose_image_pil in pose_images[:length]:
pose_list.append(pose_image_pil)
pose_tensor_list.append(pose_transform(pose_image_pil))

video = self.pipeline(
ref_image,
pose_list,
width=width,
height=height,
video_length=length,
num_inference_steps=num_inference_steps,
guidance_scale=cfg,
generator=generator,
).videos

ref_image_tensor = pose_transform(ref_image) # (c, h, w)
ref_image_tensor = ref_image_tensor.unsqueeze(1).unsqueeze(0) # (1, c, 1, h, w)
ref_image_tensor = repeat(
ref_image_tensor, "b c f h w -> b c (repeat f) h w", repeat=length
)
pose_tensor = torch.stack(pose_tensor_list, dim=0) # (f, c, h, w)
pose_tensor = pose_tensor.transpose(0, 1)
pose_tensor = pose_tensor.unsqueeze(0)

# ref_image_tensor = ref_image_tensor[:, :3, :, :, :]
# video = torch.cat([ref_image_tensor, pose_tensor, video], dim=0)

save_dir = f"./output/gradio"
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
date_str = datetime.now().strftime("%Y%m%d")
time_str = datetime.now().strftime("%H%M")
out_path = os.path.join(save_dir, f"{date_str}T{time_str}.mp4")
save_videos_grid(
video,
out_path,
n_rows=1,
fps=src_fps,
)

torch.cuda.empty_cache()

return out_path


def download_weights(url, dest):
start = time.time()
print("downloading url: ", url)
print("downloading to: ", dest)
subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
def setup(self) -> None:
"""Load the model into memory to make running multiple predictions efficient"""
if not os.path.exists(MOORE_ANIMATEANYONE_CACHE):
download_weights(MOORE_ANIMATEANYONE_URL, MOORE_ANIMATEANYONE_CACHE)

self.controller = AnimateController()

def predict(
self,
reference_image: Path = Input(description="Path to the reference image"),
motion_sequence: Path = Input(description="Path to the motion sequence video"),
width: int = Input(
description="Desired width of the output video",
default=512,
ge=448,
le=768,
),
height: int = Input(
description="Desired height of the output video",
default=768,
ge=512,
le=1024,
),
length: int = Input(
description="Desired length of the output video in frames",
default=24,
ge=24,
le=128,
),
sampling_steps: int = Input(
description="Number of sampling steps for the animation",
default=25,
ge=10,
le=30,
),
guidance_scale: float = Input(
description="Scale for guidance during animation generation",
default=3.5,
ge=2.0,
le=10.0,
),
seed: int = Input(
description="Random seed. Leave blank to randomize the seed", default=None
),
) -> Path:
"""Run a single prediction on the model"""

if seed is None:
seed = int.from_bytes(os.urandom(2), "big")
print(f"Using seed: {seed}")

reference_image = Image.open(str(reference_image))
motion_sequence = str(motion_sequence)

# Call the animate method from the controller
animation = self.controller.animate(
reference_image,
motion_sequence,
width,
height,
length,
sampling_steps,
guidance_scale,
seed,
)

return Path(animation)
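For a quick local check before publishing, the predictor can also be exercised directly in Python. A minimal sketch, assuming a CUDA GPU, the dependencies from cog.yaml, and placeholder sample files `ref.png` and `pose.mp4` in the working directory; `setup()` will fetch the weights via pget if `./pretrained_weights` is missing.

```python
# Local smoke test of the Cog predictor (sketch, not part of this PR).
from cog import Path

from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads ./pretrained_weights via pget if missing
out_path = predictor.predict(
    reference_image=Path("ref.png"),   # placeholder reference image
    motion_sequence=Path("pose.mp4"),  # placeholder pose video
    width=512,
    height=768,
    length=24,
    sampling_steps=25,
    guidance_scale=3.5,
    seed=42,
)
print(out_path)  # e.g. ./output/gradio/<date>T<time>.mp4
```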