@@ -555,7 +555,7 @@ def __call__(
555
555
self ,
556
556
prompt : Union [str , List [str ]],
557
557
negative_prompt : Optional [Union [str , List [str ]]] = None ,
558
- init_image : Union [torch .FloatTensor , PIL .Image .Image ] = None ,
558
+ image : Union [torch .FloatTensor , PIL .Image .Image ] = None ,
559
559
mask_image : Union [torch .FloatTensor , PIL .Image .Image ] = None ,
560
560
height : int = 512 ,
561
561
width : int = 512 ,
@@ -583,11 +583,11 @@ def __call__(
583
583
negative_prompt (`str` or `List[str]`, *optional*):
584
584
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
585
585
if `guidance_scale` is less than `1`).
586
- init_image (`torch.FloatTensor` or `PIL.Image.Image`):
586
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
587
587
`Image`, or tensor representing an image batch, that will be used as the starting point for the
588
588
process.
589
589
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
590
- `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
590
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
591
591
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
592
592
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
593
593
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -605,11 +605,11 @@ def __call__(
605
605
1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
606
606
usually at the expense of lower image quality.
607
607
strength (`float`, *optional*, defaults to 0.8):
608
- Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
609
- `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
608
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
609
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
610
610
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
611
611
noise will be maximum and the denoising process will run for the full number of iterations specified in
612
- `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
612
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
613
613
num_images_per_prompt (`int`, *optional*, defaults to 1):
614
614
The number of images to generate per prompt.
615
615
eta (`float`, *optional*, defaults to 0.0):
@@ -648,6 +648,9 @@ def __call__(
648
648
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
649
649
(nsfw) content, according to the `safety_checker`.
650
650
"""
651
+ message = "Please use `image` instead of `init_image`."
652
+ init_image = deprecate ("init_image" , "0.12.0" , message , take_from = kwargs )
653
+ image = init_image or image
651
654
652
655
if isinstance (prompt , str ):
653
656
batch_size = 1
@@ -714,7 +717,7 @@ def __call__(
714
717
mask = None
715
718
noise = None
716
719
717
- if init_image is None :
720
+ if image is None :
718
721
# get the initial random noise unless the user supplied it
719
722
720
723
# Unlike in other pipelines, latents need to be generated in the target device
@@ -753,11 +756,11 @@ def __call__(
753
756
# scale the initial noise by the standard deviation required by the scheduler
754
757
latents = latents * self .scheduler .init_noise_sigma
755
758
else :
756
- if isinstance (init_image , PIL .Image .Image ):
757
- init_image = preprocess_image (init_image )
759
+ if isinstance (image , PIL .Image .Image ):
760
+ image = preprocess_image (image )
758
761
# encode the init image into latents and scale the latents
759
- init_image = init_image .to (device = self .device , dtype = latents_dtype )
760
- init_latent_dist = self .vae .encode (init_image ).latent_dist
762
+ image = image .to (device = self .device , dtype = latents_dtype )
763
+ init_latent_dist = self .vae .encode (image ).latent_dist
761
764
init_latents = init_latent_dist .sample (generator = generator )
762
765
init_latents = 0.18215 * init_latents
763
766
init_latents = torch .cat ([init_latents ] * batch_size * num_images_per_prompt , dim = 0 )
@@ -772,7 +775,7 @@ def __call__(
772
775
773
776
# check sizes
774
777
if not mask .shape == init_latents .shape :
775
- raise ValueError ("The mask and init_image should be the same size!" )
778
+ raise ValueError ("The mask and image should be the same size!" )
776
779
777
780
# get the original timestep using init_timestep
778
781
offset = self .scheduler .config .get ("steps_offset" , 0 )
@@ -961,7 +964,7 @@ def text2img(
961
964
962
965
def img2img (
963
966
self ,
964
- init_image : Union [torch .FloatTensor , PIL .Image .Image ],
967
+ image : Union [torch .FloatTensor , PIL .Image .Image ],
965
968
prompt : Union [str , List [str ]],
966
969
negative_prompt : Optional [Union [str , List [str ]]] = None ,
967
970
strength : float = 0.8 ,
@@ -980,7 +983,7 @@ def img2img(
980
983
r"""
981
984
Function for image-to-image generation.
982
985
Args:
983
- init_image (`torch.FloatTensor` or `PIL.Image.Image`):
986
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
984
987
`Image`, or tensor representing an image batch, that will be used as the starting point for the
985
988
process.
986
989
prompt (`str` or `List[str]`):
@@ -989,11 +992,11 @@ def img2img(
989
992
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
990
993
if `guidance_scale` is less than `1`).
991
994
strength (`float`, *optional*, defaults to 0.8):
992
- Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
993
- `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
995
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
996
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
994
997
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
995
998
noise will be maximum and the denoising process will run for the full number of iterations specified in
996
- `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
999
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
997
1000
num_inference_steps (`int`, *optional*, defaults to 50):
998
1001
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
999
1002
expense of slower inference. This parameter will be modulated by `strength`.
@@ -1035,7 +1038,7 @@ def img2img(
1035
1038
return self .__call__ (
1036
1039
prompt = prompt ,
1037
1040
negative_prompt = negative_prompt ,
1038
- init_image = init_image ,
1041
+ image = image ,
1039
1042
num_inference_steps = num_inference_steps ,
1040
1043
guidance_scale = guidance_scale ,
1041
1044
strength = strength ,
@@ -1052,7 +1055,7 @@ def img2img(
1052
1055
1053
1056
def inpaint (
1054
1057
self ,
1055
- init_image : Union [torch .FloatTensor , PIL .Image .Image ],
1058
+ image : Union [torch .FloatTensor , PIL .Image .Image ],
1056
1059
mask_image : Union [torch .FloatTensor , PIL .Image .Image ],
1057
1060
prompt : Union [str , List [str ]],
1058
1061
negative_prompt : Optional [Union [str , List [str ]]] = None ,
@@ -1072,11 +1075,11 @@ def inpaint(
1072
1075
r"""
1073
1076
Function for inpaint.
1074
1077
Args:
1075
- init_image (`torch.FloatTensor` or `PIL.Image.Image`):
1078
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1076
1079
`Image`, or tensor representing an image batch, that will be used as the starting point for the
1077
1080
process. This is the image whose masked region will be inpainted.
1078
1081
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
1079
- `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
1082
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
1080
1083
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
1081
1084
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
1082
1085
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1088,7 +1091,7 @@ def inpaint(
1088
1091
strength (`float`, *optional*, defaults to 0.8):
1089
1092
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
1090
1093
is 1, the denoising process will be run on the masked area for the full number of iterations specified
1091
- in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
1094
+ in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
1092
1095
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
1093
1096
num_inference_steps (`int`, *optional*, defaults to 50):
1094
1097
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1131,7 +1134,7 @@ def inpaint(
1131
1134
return self .__call__ (
1132
1135
prompt = prompt ,
1133
1136
negative_prompt = negative_prompt ,
1134
- init_image = init_image ,
1137
+ image = image ,
1135
1138
mask_image = mask_image ,
1136
1139
num_inference_steps = num_inference_steps ,
1137
1140
guidance_scale = guidance_scale ,
0 commit comments