diff --git a/inference/huggingface/stable-diffusion/README.md b/inference/huggingface/stable-diffusion/README.md index 963d413b1c..af89a1db6f 100644 --- a/inference/huggingface/stable-diffusion/README.md +++ b/inference/huggingface/stable-diffusion/README.md @@ -11,6 +11,8 @@ pip install -r requirements.txt Examples can be run as follows:
deepspeed --num_gpus [number of GPUs] test-[model].py+NOTE: Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1`. + # Example Output Command:
diff --git a/inference/huggingface/stable-diffusion/test-stable-diffusion.py b/inference/huggingface/stable-diffusion/test-stable-diffusion.py index 8959341fdf..02bd2387ef 100644 --- a/inference/huggingface/stable-diffusion/test-stable-diffusion.py +++ b/inference/huggingface/stable-diffusion/test-stable-diffusion.py @@ -9,7 +9,7 @@ model = "prompthero/midjourney-v4-diffusion" local_rank = int(os.getenv("LOCAL_RANK", "0")) device = torch.device(f"cuda:{local_rank}") -world_size = int(os.getenv('WORLD_SIZE', '4')) +world_size = int(os.getenv('WORLD_SIZE', '1')) generator = torch.Generator(device=torch.cuda.current_device()) pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) @@ -19,12 +19,14 @@ baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] baseline_image.save(f"baseline.png") -# NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules +# NOTE: DeepSpeed inference supports local CUDA graphs for replaced SD modules. +# Local CUDA graphs for replaced SD modules will only be enabled when `mp_size==1` pipe = deepspeed.init_inference( pipe, + mp_size=world_size, dtype=torch.half, replace_with_kernel_inject=True, - enable_cuda_graph=True, + enable_cuda_graph=world_size == 1, ) generator.manual_seed(0xABEDABE7)