
Commit 52eb034

Standardize on using image argument in all pipelines (huggingface#1361)
* feat: switch core pipelines to use image arg
* test: update tests for core pipelines
* feat: switch examples to use image arg
* docs: update docs to use image arg
* style: format code using black and doc-builder
* fix: deprecate use of init_image in all pipelines
1 parent 2bbf8b6 commit 52eb034

27 files changed: +264 −223 lines
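The deprecation half of this commit follows one recurring pattern across the pipelines below. Here is a minimal, self-contained sketch of that pattern; the standalone `deprecate` helper is a simplified stand-in for `diffusers.utils.deprecate` (not its actual implementation), and `ExamplePipeline` is hypothetical:

```python
import warnings


def deprecate(name, version, message, take_from=None):
    # Simplified stand-in for `diffusers.utils.deprecate`: pop the old
    # kwarg if the caller passed it, warn, and hand back its value.
    value = take_from.pop(name, None) if take_from is not None else None
    if value is not None:
        warnings.warn(
            f"`{name}` is deprecated and will be removed in {version}. {message}",
            FutureWarning,
        )
    return value


class ExamplePipeline:
    # Hypothetical pipeline showing how each `__call__`/`train` in this
    # commit accepts the new `image` arg while still honoring the
    # deprecated `init_image` kwarg until version 0.12.0.
    def __call__(self, prompt, image=None, **kwargs):
        message = "Please use `image` instead of `init_image`."
        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
        image = init_image or image  # old kwarg wins only if it was passed
        return prompt, image
```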

README.md (+1 −1)

````diff
@@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512))
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 images[0].save("fantasy_landscape.png")
 ```
````

docs/source/api/pipelines/cycle_diffusion.mdx (+2 −2)

````diff
@@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant"
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.8,
@@ -83,7 +83,7 @@ torch.manual_seed(0)
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.85,
````

docs/source/api/pipelines/overview.mdx (+1 −1)

````diff
@@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512))
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 images[0].save("fantasy_landscape.png")
 ```
````

docs/source/using-diffusers/custom_pipeline_examples.mdx (+2 −2)

````diff
@@ -177,7 +177,7 @@ init_image = download_image(
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 ### Inpainting
 
@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
 ```
 
 As shown above, this one pipeline can run "text-to-image", "image-to-image", and "inpainting" all in one pipeline.
````

docs/source/using-diffusers/img2img.mdx (+1 −1)

````diff
@@ -37,7 +37,7 @@ init_image.thumbnail((768, 768))
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 images[0].save("fantasy_landscape.png")
 ```
````

examples/community/README.md (+3 −3)

````diff
@@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 ### Inpainting
 
@@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
 ```
 
 As shown above, this one pipeline can run "text-to-image", "image-to-image", and "inpainting" all in one pipeline.
@@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB")
 init_image = init_image.resize((512, 512))
 res = pipe.train(
     prompt,
-    init_image,
+    image=init_image,
     guidance_scale=7.5,
     num_inference_steps=50,
     generator=generator)
````

examples/community/imagic_stable_diffusion.py (+18 −14)

````diff
@@ -17,7 +17,7 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import deprecate, logging
 
 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -133,7 +133,7 @@ def disable_attention_slicing(self):
     def train(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         generator: Optional[torch.Generator] = None,
@@ -184,6 +184,10 @@ def train(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         accelerator = Accelerator(
             gradient_accumulation_steps=1,
             mixed_precision="fp16",
@@ -241,14 +245,14 @@ def train(
             lr=embedding_learning_rate,
         )
 
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)
 
         latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
-        init_latent_image_dist = self.vae.encode(init_image).latent_dist
-        init_image_latents = init_latent_image_dist.sample(generator=generator)
-        init_image_latents = 0.18215 * init_image_latents
+        image = image.to(device=self.device, dtype=latents_dtype)
+        init_latent_image_dist = self.vae.encode(image).latent_dist
+        image_latents = init_latent_image_dist.sample(generator=generator)
+        image_latents = 0.18215 * image_latents
 
         progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
         progress_bar.set_description("Steps")
@@ -259,12 +263,12 @@ def train(
         for _ in range(text_embedding_optimization_steps):
             with accelerator.accumulate(text_embeddings):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)
 
                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
 
                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
@@ -301,12 +305,12 @@ def train(
         for _ in range(model_fine_tuning_optimization_steps):
             with accelerator.accumulate(self.unet.parameters()):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)
 
                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
 
                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
````

examples/community/lpw_stable_diffusion.py (+26 −23)

````diff
@@ -555,7 +555,7 @@ def __call__(
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -583,11 +583,11 @@ def __call__(
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -605,11 +605,11 @@ def __call__(
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -648,6 +648,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -714,7 +717,7 @@ def __call__(
         mask = None
         noise = None
 
-        if init_image is None:
+        if image is None:
             # get the initial random noise unless the user supplied it
 
             # Unlike in other pipelines, latents need to be generated in the target device
@@ -753,11 +756,11 @@ def __call__(
             # scale the initial noise by the standard deviation required by the scheduler
             latents = latents * self.scheduler.init_noise_sigma
         else:
-            if isinstance(init_image, PIL.Image.Image):
-                init_image = preprocess_image(init_image)
+            if isinstance(image, PIL.Image.Image):
+                image = preprocess_image(image)
             # encode the init image into latents and scale the latents
-            init_image = init_image.to(device=self.device, dtype=latents_dtype)
-            init_latent_dist = self.vae.encode(init_image).latent_dist
+            image = image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(image).latent_dist
             init_latents = init_latent_dist.sample(generator=generator)
             init_latents = 0.18215 * init_latents
             init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
@@ -772,7 +775,7 @@ def __call__(
 
             # check sizes
             if not mask.shape == init_latents.shape:
-                raise ValueError("The mask and init_image should be the same size!")
+                raise ValueError("The mask and image should be the same size!")
 
             # get the original timestep using init_timestep
             offset = self.scheduler.config.get("steps_offset", 0)
@@ -961,7 +964,7 @@ def text2img(
 
     def img2img(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -980,7 +983,7 @@ def img2img(
        r"""
        Function for image-to-image generation.
        Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
@@ -989,11 +992,11 @@ def img2img(
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
@@ -1035,7 +1038,7 @@ def img2img(
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
@@ -1052,7 +1055,7 @@ def img2img(
 
    def inpaint(
        self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1072,11 +1075,11 @@ def inpaint(
        r"""
        Function for inpaint.
        Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1088,7 +1091,7 @@ def inpaint(
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1131,7 +1134,7 @@ def inpaint(
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
````
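Taken together, the changes above keep call sites working during the deprecation window: passing `init_image=` still succeeds but emits a deprecation warning until it is removed in 0.12.0. A minimal migration sketch (the model id and input image here are illustrative, not part of this commit):

```python
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image

pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
init_image = Image.open("sketch.png").convert("RGB").resize((768, 512))

# New, standardized argument name:
images = pipe(prompt="A fantasy landscape", image=init_image, strength=0.75).images

# Old name is still accepted until 0.12.0, but raises a FutureWarning:
images = pipe(prompt="A fantasy landscape", init_image=init_image, strength=0.75).images
```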
