diff --git a/annotator/uniformer/__init__.py b/annotator/uniformer/__init__.py
index 3364d40997..e6ff24f630 100644
--- a/annotator/uniformer/__init__.py
+++ b/annotator/uniformer/__init__.py
@@ -18,10 +18,12 @@ def __init__(self):
         if not os.path.exists(modelpath):
             from basicsr.utils.download_util import load_file_from_url
             load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path)
+        # config.py (ADE20K by default)
         config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py")
         self.model = init_segmentor(config_file, modelpath).cuda()

     def __call__(self, img):
         result = inference_segmentor(self.model, img)
+        # palette: cityscapes, ade, voc
         res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1)
         return res_img
diff --git a/annotator/uniformer/exp/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py b/annotator/uniformer/exp/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..28d756873a
--- /dev/null
+++ b/annotator/uniformer/exp/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py
@@ -0,0 +1,5 @@
+_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py']
+
+model = dict(
+    backbone=dict(
+        depth=101))
\ No newline at end of file
diff --git a/annotator/uniformer/exp/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py b/annotator/uniformer/exp/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..c62a6e900b
--- /dev/null
+++ b/annotator/uniformer/exp/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
@@ -0,0 +1,183 @@
+_base_ = ['../../configs/_base_/default_runtime.py', '../../configs/_base_/datasets/cityscapes.py']
+
+crop_size = (512, 1024)
+num_classes = 19
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='SyncBN', requires_grad=False),
+        style='pytorch'),
+    decode_head=dict(
+        type='Mask2FormerHead',
+        in_channels=[256, 512, 1024, 2048],
+        strides=[4, 8, 16, 32],
+        feat_channels=256,
+        out_channels=256,
+        num_classes=num_classes,
+        num_queries=100,
+        num_transformer_feat_level=3,
+        align_corners=False,
+        pixel_decoder=dict(
+            type='mmdet.MSDeformAttnPixelDecoder',
+            num_outs=3,
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU'),
+            encoder=dict(  # DeformableDetrTransformerEncoder
+                num_layers=6,
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
+                        embed_dims=256,
+                        num_heads=8,
+                        num_levels=3,
+                        num_points=4,
+                        im2col_step=64,
+                        dropout=0.0,
+                        batch_first=True,
+                        norm_cfg=None,
+                        init_cfg=None),
+                    ffn_cfg=dict(
+                        embed_dims=256,
+                        feedforward_channels=1024,
+                        num_fcs=2,
+                        ffn_drop=0.0,
+                        act_cfg=dict(type='ReLU', inplace=True))),
+                init_cfg=None),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
+            init_cfg=None),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
+            return_intermediate=True,
+            num_layers=9,
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+
proj_drop=0.0, + dropout_layer=None, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True)), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict()) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomChoiceResize', + scales=[int(1024 * x * 0.1) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=4096), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=90000, + by_epoch=False) +] + +# training schedule for 90k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=5000) +val_cfg = dict(type='ValLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) \ No newline at end of file diff --git a/annotator/uniformer/exp/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py b/annotator/uniformer/exp/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..05ff96b8ea --- /dev/null +++ b/annotator/uniformer/exp/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=192, + depths=depths, + num_heads=[6, 12, 24, 48], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[192, 384, 768, 1536])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) \ No newline at end of file diff --git a/annotator/uniformer/exp/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py b/annotator/uniformer/exp/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..6d7936b3fa --- /dev/null +++ b/annotator/uniformer/exp/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,52 @@ +_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa +depths = [2, 2, 6, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[96, 192, 384, 768])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 
'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) \ No newline at end of file diff --git a/cldm/cldm.py b/cldm/cldm.py index 0b3ac7a575..febc615e76 100644 --- a/cldm/cldm.py +++ b/cldm/cldm.py @@ -331,10 +331,21 @@ def apply_model(self, x_noisy, t, cond, *args, **kwargs): cond_txt = torch.cat(cond['c_crossattn'], 1) + # !!! + # obstruction_txt = None + # if cond['obstruction_c_crossattn'] != None: + # obstruction_txt = torch.cat(cond['obstruction_c_crossattn'], 1) + if cond['c_concat'] is None: eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=None, only_mid_control=self.only_mid_control) else: + # !!! control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt) + # if obstruction_txt != None: + # control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=obstruction_txt) + # else: + # control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt) + control = [c * scale for c, scale in zip(control, self.control_scales)] eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control) diff --git a/control_sd.py b/control_sd.py new file mode 100644 index 0000000000..12be31be4b --- /dev/null +++ b/control_sd.py @@ -0,0 +1,140 @@ +import cv2 +import einops +import numpy as np +import torch +import random +# import gradio as gr +from pytorch_lightning import seed_everything +from ControlNet.annotator.util import resize_image, HWC3 +from ControlNet.cldm.model import create_model, load_state_dict +from ControlNet.cldm.ddim_hacked import DDIMSampler +# import config as config +save_memory = False + + +class ControlSD: + def __init__(self, model_config, checkpoint_path): + self.model = create_model(model_config).cpu() + self.model.load_state_dict(load_state_dict(checkpoint_path, location='cuda')) + self.model = self.model.cuda() + self.ddim_sampler = DDIMSampler(self.model) + + def process(self, detected_map, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, + ddim_steps, guess_mode, strength, scale, seed, eta): + print(detected_map.shape, "detected_map.shape") + print(prompt, "prompt") + print(a_prompt, "a_prompt") + print(n_prompt, "n_prompt") + print(num_samples, type(num_samples), "num_samples") + print(image_resolution, type(image_resolution), "image_resolution") + print(detect_resolution, type(detect_resolution), "detect_resolution") + print(ddim_steps, type(ddim_steps), "ddim_steps") + print(guess_mode, type(guess_mode), "guess_mode") + print(strength, type(strength), "strength") + print(scale, type(scale), "scale") + print(seed, type(seed), "seed") + print(eta, type(eta), "eta") + print("111") + with torch.no_grad(): + detected_map = HWC3(detected_map) + img = resize_image(detected_map, image_resolution) + H, W, 
C = img.shape + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + print("222") + control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 + control = torch.stack([control for _ in range(num_samples)], dim=0) + control = einops.rearrange(control, 'b h w c -> b c h w').clone() + print("333") + if seed == -1: + seed = random.randint(0, 65535) + seed_everything(seed) + print("444") + if save_memory: + self.model.low_vram_shift(is_diffusing=False) + print("555") + cond = { + "c_concat": [control], + "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)] + } + print("666") + un_cond = { + "c_concat": None if guess_mode else [control], + "c_crossattn": [self.model.get_learned_conditioning([n_prompt] * num_samples)] + } + print("777") + shape = (4, H // 8, W // 8) + print("888") + if save_memory: + self.model.low_vram_shift(is_diffusing=True) + + self.model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) + + samples, _ = self.ddim_sampler.sample(ddim_steps, num_samples, shape, cond, verbose=False, eta=eta, + unconditional_guidance_scale=scale, unconditional_conditioning=un_cond) + print("999") + if save_memory: + self.model.low_vram_shift(is_diffusing=False) + print("1010") + x_samples = self.model.decode_first_stage(samples) + x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) + print("1111") + results = [x_samples[i] for i in range(num_samples)] + return [detected_map] + results + + +def create_control_sd(): + # model_config = '/home/data2/yangsp22/code/ControlNet/models/cldm_v21.yaml' + # checkpoint_path = '/home/data2/yangsp22/code/finetune-output/sam-new/BDD-all/weights/lightning_logs/version_1/checkpoints/epoch=9-step=35139.ckpt' + + model_config = '/home/tmp/workspace/diffusion_app/backend/ControlNet/models/cldm_v21.yaml' + checkpoint_path = '/home/tmp/workspace/diffusion_app/backend/epoch=9-step=35139.ckpt' + + # model = create_model('/home/tmp/workspace/diffusion_app/backend/ControlNet/models/cldm_v21.yaml').cpu() + # model.load_state_dict(load_state_dict('/home/tmp/workspace/diffusion_app/backend/epoch=9-step=35139.ckpt', location='cuda')) + return ControlSD(model_config, checkpoint_path) + + +control_sd_instance = create_control_sd() +process = control_sd_instance.process + + +# def create_gradio_interface(control_sd): +# block = gr.Blocks().queue() +# with block: +# with gr.Row(): +# gr.Markdown("## Control Stable Diffusion with Segmentation Maps (Mask input)") +# with gr.Row(): +# with gr.Column(): +# input_image = gr.Image(source='upload', type="numpy") +# prompt = gr.Textbox(label="Prompt") +# run_button = gr.Button(label="Run") +# with gr.Accordion("Advanced options", open=False): +# num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) +# image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) +# strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) +# guess_mode = gr.Checkbox(label='Guess Mode', value=False) +# detect_resolution = gr.Slider(label="Segmentation Resolution", minimum=128, maximum=1024, value=512, step=1) +# ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) +# scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) +# seed = gr.Slider(label="Seed", minimum=-1, 
maximum=2147483647, step=1, randomize=True) +# eta = gr.Number(label="eta (DDIM)", value=0.0) +# a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') +# n_prompt = gr.Textbox(label="Negative Prompt", +# value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') +# with gr.Column(): +# result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') + +# ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] +# run_button.click(fn=control_sd.process, inputs=ips, outputs=[result_gallery]) + +# return block + + +def main(): + control_sd = create_control_sd() + block = create_gradio_interface(control_sd) + block.launch(server_name='0.0.0.0') + + +if __name__ == "__main__": + main() diff --git a/finetune/nohup.out b/finetune/nohup.out new file mode 100644 index 0000000000..1b4e41d174 --- /dev/null +++ b/finetune/nohup.out @@ -0,0 +1,9456 @@ +GPU available: True, used: True +TPU available: False, using: 0 TPU cores +IPU available: False, using: 0 IPUs +/home/yangsp22/.conda/envs/control/lib/python3.8/site-packages/pytorch_lightning/trainer/configuration_validator.py:118: UserWarning: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop. + rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.") +/home/yangsp22/.conda/envs/control/lib/python3.8/site-packages/pytorch_lightning/trainer/configuration_validator.py:280: LightningDeprecationWarning: Base `LightningModule.on_train_batch_start` hook signature has changed in v1.5. The `dataloader_idx` argument will be removed in v1.7. + rank_zero_deprecation( +/home/yangsp22/.conda/envs/control/lib/python3.8/site-packages/pytorch_lightning/trainer/configuration_validator.py:287: LightningDeprecationWarning: Base `Callback.on_train_batch_end` hook signature has changed in v1.5. The `dataloader_idx` argument will be removed in v1.7. + rank_zero_deprecation( +initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 +initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4 +initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4 +initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4 +---------------------------------------------------------------------------------------------------- +distributed_backend=nccl +All distributed processes registered. Starting with 4 processes +---------------------------------------------------------------------------------------------------- + +LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [2,4,6,7] +LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [2,4,6,7] +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [2,4,6,7] +LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [2,4,6,7] + + | Name | Type | Params +------------------------------------------------------------- +0 | model | DiffusionWrapper | 865 M +1 | first_stage_model | AutoencoderKL | 83.7 M +2 | cond_stage_model | FrozenOpenCLIPEmbedder | 354 M +3 | control_model | ControlNet | 364 M +------------------------------------------------------------- +1.2 B Trainable params +437 M Non-trainable params +1.7 B Total params +6,671.302 Total estimated model params size (MB) +No module 'xformers'. Proceeding without it. +ControlLDM: Running in eps-prediction mode +DiffusionWrapper has 865.91 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. 
+making attention of type 'vanilla' with 512 in_channels
+Loaded model config from [/home/data2/yangsp22/code/ControlNet/models/cldm_v21.yaml]
+Loaded state_dict from [/home/data2/yangsp22/code/ControlNet/models/control_sd21_ini.ckpt]
+ Training: 0it [00:00, ?it/s] Training: 0%| | 0/1208 [00:00 /home/data2/yangsp22/code/finetune-output/sam-new/CODA-all/output2.log 2>&1 &
+
+# Configs
+resume_path = '/home/data2/yangsp22/code/ControlNet/models/control_sd21_ini.ckpt'
+batch_size = 1
+accumulate_grad_batches = 4
+logger_freq = 300
+learning_rate = 1e-5
+max_steps = 20000 # stop after this many training steps; -1 means no limit; one step trains on one batch of batch_size samples # 9000steps
+max_epochs = -1 # stop after this many epochs; -1 means no limit; one epoch means every sample has been seen once # 8epochs
+sd_locked = True
+only_mid_control = False
+weights_save_path = '/home/data2/yangsp22/code/finetune-output/sam-new/CODA-all/weights'
+default_root_dir = '/home/data2/yangsp22/code/finetune-output/sam-new/CODA-all'
+
+
+# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs.
+model = create_model('/home/data2/yangsp22/code/ControlNet/models/cldm_v21.yaml').cpu()
+model.load_state_dict(load_state_dict(resume_path, location='cpu'))
+model.learning_rate = learning_rate
+model.sd_locked = sd_locked
+model.only_mid_control = only_mid_control
+
+
+# Misc
+dataset = MyDataset()
+dataloader = DataLoader(dataset, num_workers=4, pin_memory=True, batch_size=batch_size, shuffle=True)
+logger = ImageLogger(batch_frequency=logger_freq)
+trainer = pl.Trainer(gpus=5, precision=32, callbacks=[logger], accumulate_grad_batches=accumulate_grad_batches,
+                     max_steps=max_steps, max_epochs=max_epochs,
+                     weights_save_path=weights_save_path, default_root_dir=default_root_dir, strategy="ddp")
+
+
+# Train!
+# if __name__ == '__main__':
+trainer.fit(model, dataloader)
diff --git a/gradio_annotator.py b/gradio_annotator.py
deleted file mode 100644
index 2b1a29ebbe..0000000000
--- a/gradio_annotator.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import gradio as gr
-
-from annotator.util import resize_image, HWC3
-
-
-model_canny = None
-
-
-def canny(img, res, l, h):
-    img = resize_image(HWC3(img), res)
-    global model_canny
-    if model_canny is None:
-        from annotator.canny import CannyDetector
-        model_canny = CannyDetector()
-    result = model_canny(img, l, h)
-    return [result]
-
-
-model_hed = None
-
-
-def hed(img, res):
-    img = resize_image(HWC3(img), res)
-    global model_hed
-    if model_hed is None:
-        from annotator.hed import HEDdetector
-        model_hed = HEDdetector()
-    result = model_hed(img)
-    return [result]
-
-
-model_mlsd = None
-
-
-def mlsd(img, res, thr_v, thr_d):
-    img = resize_image(HWC3(img), res)
-    global model_mlsd
-    if model_mlsd is None:
-        from annotator.mlsd import MLSDdetector
-        model_mlsd = MLSDdetector()
-    result = model_mlsd(img, thr_v, thr_d)
-    return [result]
-
-
-model_midas = None
-
-
-def midas(img, res, a):
-    img = resize_image(HWC3(img), res)
-    global model_midas
-    if model_midas is None:
-        from annotator.midas import MidasDetector
-        model_midas = MidasDetector()
-    results = model_midas(img, a)
-    return results
-
-
-model_openpose = None
-
-
-def openpose(img, res, has_hand):
-    img = resize_image(HWC3(img), res)
-    global model_openpose
-    if model_openpose is None:
-        from annotator.openpose import OpenposeDetector
-        model_openpose = OpenposeDetector()
-    result, _ = model_openpose(img, has_hand)
-    return [result]
-
-
-model_uniformer = None
-
-
-def uniformer(img, res):
-    img = resize_image(HWC3(img), res)
-    global model_uniformer
-    if model_uniformer is None:
-        from annotator.uniformer import
UniformerDetector - model_uniformer = UniformerDetector() - result = model_uniformer(img) - return [result] - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Canny Edge") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - low_threshold = gr.Slider(label="low_threshold", minimum=1, maximum=255, value=100, step=1) - high_threshold = gr.Slider(label="high_threshold", minimum=1, maximum=255, value=200, step=1) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=canny, inputs=[input_image, resolution, low_threshold, high_threshold], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## HED Edge") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=hed, inputs=[input_image, resolution], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## MLSD Edge") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - value_threshold = gr.Slider(label="value_threshold", minimum=0.01, maximum=2.0, value=0.1, step=0.01) - distance_threshold = gr.Slider(label="distance_threshold", minimum=0.01, maximum=20.0, value=0.1, step=0.01) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=384, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=mlsd, inputs=[input_image, resolution, value_threshold, distance_threshold], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## MIDAS Depth and Normal") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - alpha = gr.Slider(label="alpha", minimum=0.1, maximum=20.0, value=6.2, step=0.01) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=384, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=midas, inputs=[input_image, resolution, alpha], outputs=[gallery]) - - with gr.Row(): - gr.Markdown("## Openpose") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - hand = gr.Checkbox(label='detect hand', value=False) - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=openpose, inputs=[input_image, resolution, hand], outputs=[gallery]) - - - with gr.Row(): - gr.Markdown("## Uniformer Segmentation") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64) - run_button = gr.Button(label="Run") - with gr.Column(): - gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto") - run_button.click(fn=uniformer, inputs=[input_image, 
resolution], outputs=[gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_canny2image.py b/gradio_canny2image.py deleted file mode 100644 index 9866cac5b3..0000000000 --- a/gradio_canny2image.py +++ /dev/null @@ -1,97 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.canny import CannyDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_canny = CannyDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_canny.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, low_threshold, high_threshold): - with torch.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - H, W, C = img.shape - - detected_map = apply_canny(img, low_threshold, high_threshold) - detected_map = HWC3(detected_map) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Canny Edge Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - low_threshold = gr.Slider(label="Canny low threshold", minimum=1, maximum=255, value=100, step=1) - high_threshold = gr.Slider(label="Canny high threshold", minimum=1, maximum=255, value=200, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, low_threshold, high_threshold] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_depth2image.py b/gradio_depth2image.py deleted file mode 100644 index ee678999ae..0000000000 --- a/gradio_depth2image.py +++ /dev/null @@ -1,98 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.midas import MidasDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_midas = MidasDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_depth.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map, _ = apply_midas(resize_image(input_image, detect_resolution)) - 
detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Depth Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Depth Resolution", minimum=128, maximum=1024, value=384, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_fake_scribble2image.py b/gradio_fake_scribble2image.py deleted file mode 100644 index a7cd375f75..0000000000 --- 
a/gradio_fake_scribble2image.py +++ /dev/null @@ -1,102 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.hed import HEDdetector, nms -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_hed = HEDdetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_scribble.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_hed(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - detected_map = nms(detected_map, 127, 3.0) - detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0) - detected_map[detected_map > 4] = 255 - detected_map[detected_map < 255] = 0 - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Fake Scribble Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_hed2image.py b/gradio_hed2image.py deleted file mode 100644 index 1ceff67969..0000000000 --- a/gradio_hed2image.py +++ /dev/null @@ -1,98 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.hed import HEDdetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_hed = HEDdetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_hed.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_hed(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = 
cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with HED Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_hough2image.py b/gradio_hough2image.py deleted file mode 100644 index 6095eeb676..0000000000 --- a/gradio_hough2image.py +++ /dev/null @@ -1,100 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np 
-import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.mlsd import MLSDdetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_mlsd = MLSDdetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_mlsd.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, value_threshold, distance_threshold): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_mlsd(resize_image(input_image, detect_resolution), value_threshold, distance_threshold) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - cv2.dilate(detected_map, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Hough Line Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Hough Resolution", minimum=128, maximum=1024, value=512, step=1) - value_threshold = gr.Slider(label="Hough value threshold (MLSD)", minimum=0.01, maximum=2.0, value=0.1, step=0.01) - distance_threshold = gr.Slider(label="Hough distance threshold (MLSD)", minimum=0.01, maximum=20.0, value=0.1, step=0.01) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, value_threshold, distance_threshold] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_normal2image.py b/gradio_normal2image.py deleted file mode 100644 index 30aea2f8d4..0000000000 --- a/gradio_normal2image.py +++ /dev/null @@ -1,99 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.midas import MidasDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_midas = MidasDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_normal.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, 
image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, bg_threshold): - with torch.no_grad(): - input_image = HWC3(input_image) - _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = torch.from_numpy(detected_map[:, :, ::-1].copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Normal Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="Normal Resolution", minimum=128, maximum=1024, value=384, step=1) - bg_threshold = gr.Slider(label="Normal background threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.01) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, 
prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta, bg_threshold] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_pose2image.py b/gradio_pose2image.py deleted file mode 100644 index 700973bfab..0000000000 --- a/gradio_pose2image.py +++ /dev/null @@ -1,98 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from annotator.openpose import OpenposeDetector -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -apply_openpose = OpenposeDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_openpose.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - input_image = HWC3(input_image) - detected_map, _ = apply_openpose(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Human Pose") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - detect_resolution = gr.Slider(label="OpenPose Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_scribble2image.py b/gradio_scribble2image.py deleted file mode 100644 index 8abbc25bde..0000000000 --- a/gradio_scribble2image.py +++ /dev/null @@ -1,92 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_scribble.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - H, W, C = img.shape - - detected_map = np.zeros_like(img, dtype=np.uint8) - detected_map[np.min(img, axis=2) < 127] = 255 - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], 
dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Scribble Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source='upload', type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_scribble2image_interactive.py b/gradio_scribble2image_interactive.py deleted file mode 100644 index 7308bcc1bb..0000000000 --- a/gradio_scribble2image_interactive.py +++ /dev/null @@ -1,102 +0,0 @@ -from share import * -import config - -import cv2 -import einops -import gradio as gr -import numpy as np -import torch -import random - -from pytorch_lightning import seed_everything -from annotator.util import resize_image, HWC3 -from cldm.model import create_model, load_state_dict -from cldm.ddim_hacked import DDIMSampler - - -model = 
create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_scribble.pth', location='cuda')) -model = model.cuda() -ddim_sampler = DDIMSampler(model) - - -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): - with torch.no_grad(): - img = resize_image(HWC3(input_image['mask'][:, :, 0]), image_resolution) - H, W, C = img.shape - - detected_map = np.zeros_like(img, dtype=np.uint8) - detected_map[np.min(img, axis=2) > 127] = 255 - - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} - un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) - - if config.save_memory: - model.low_vram_shift(is_diffusing=True) - - model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, - shape, cond, verbose=False, eta=eta, - unconditional_guidance_scale=scale, - unconditional_conditioning=un_cond) - - if config.save_memory: - model.low_vram_shift(is_diffusing=False) - - x_samples = model.decode_first_stage(samples) - x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - - results = [x_samples[i] for i in range(num_samples)] - return [255 - detected_map] + results - - -def create_canvas(w, h): - return np.zeros(shape=(h, w, 3), dtype=np.uint8) + 255 - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Interactive Scribbles") - with gr.Row(): - with gr.Column(): - canvas_width = gr.Slider(label="Canvas Width", minimum=256, maximum=1024, value=512, step=1) - canvas_height = gr.Slider(label="Canvas Height", minimum=256, maximum=1024, value=512, step=1) - create_button = gr.Button(label="Start", value='Open drawing canvas!') - input_image = gr.Image(source='upload', type='numpy', tool='sketch') - gr.Markdown(value='Do not forget to change your brush width to make it thinner. (Gradio do not allow developers to set brush width so you need to do it manually.) 
' - 'Just click on the small pencil icon in the upper right corner of the above block.') - create_button.click(fn=create_canvas, inputs=[canvas_width, canvas_height], outputs=[input_image]) - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label='Guess Mode', value=False) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed') - n_prompt = gr.Textbox(label="Negative Prompt", - value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality') - with gr.Column(): - result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto') - ips = [input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, guess_mode, strength, scale, seed, eta] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name='0.0.0.0') diff --git a/gradio_seg2image.py b/gradio_seg2image-sd21-mask_input.py similarity index 57% rename from gradio_seg2image.py rename to gradio_seg2image-sd21-mask_input.py index c3854dc762..db37c613c0 100644 --- a/gradio_seg2image.py +++ b/gradio_seg2image-sd21-mask_input.py @@ -10,47 +10,76 @@ from pytorch_lightning import seed_everything from annotator.util import resize_image, HWC3 -from annotator.uniformer import UniformerDetector from cldm.model import create_model, load_state_dict from cldm.ddim_hacked import DDIMSampler +# The input image is a segmentation map!!!
-apply_uniformer = UniformerDetector() - -model = create_model('./models/cldm_v15.yaml').cpu() -model.load_state_dict(load_state_dict('./models/control_sd15_seg.pth', location='cuda')) +model = create_model('/home/tmp/workspace/old_diff/models/cldm_v21.yaml').cpu() +# /home/data2/yangsp22/code/finetune-output/sam-new/CODA-all/weights/lightning_logs/version_1/checkpoints/epoch=40-step=19999.ckpt +model.load_state_dict(load_state_dict('/home/tmp/workspace/diffusion_app/backend/epoch=9-step=35139.ckpt', location='cuda')) model = model.cuda() ddim_sampler = DDIMSampler(model) -def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): +def process(detected_map, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta): + print(detected_map.shape, "detected_map.shape") + print(prompt, "prompt") + print(a_prompt, "a_prompt") + print(n_prompt, "n_prompt") + print(num_samples, type(num_samples), "num_samples") + print(image_resolution, type(image_resolution), "image_resolution") + print(detect_resolution, type(detect_resolution), "detect_resolution") + print(ddim_steps, type(ddim_steps), "ddim_steps") + print(guess_mode, type(guess_mode), "guess_mode") + print(strength, type(strength), "strength") + print(scale, type(scale), "scale") + print(seed, type(seed), "seed") + print(eta, type(eta), "eta") + print("111") with torch.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - + detected_map = HWC3(detected_map) # e.g. for a 1020 x 1920 input image, (h, w, c) = (height, width, channel) = (1020, 1920, 3) + img = resize_image(detected_map, image_resolution) # (1020, 1920, 3) -> (512, 960, 3) + H, W, C = img.shape # 512, 960, 3 + print(11111) + # cv2.INTER_NEAREST: nearest-neighbour interpolation + # resize detected_map to the target size (W, H) using nearest-neighbour interpolation detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, 'b h w c -> b c h w').clone() + control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0 # shape=torch.Size([512,960,3]), dtype=torch.float32, values in [0, 1] + control = torch.stack([control for _ in range(num_samples)], dim=0) # num_samples=1, shape=torch.Size([1,512,960,3]) + control = einops.rearrange(control, 'b h w c -> b c h w').clone() # b h w c -> b c h w if seed == -1: seed = random.randint(0, 65535) - seed_everything(seed) + seed_everything(seed) # random seed if config.save_memory: model.low_vram_shift(is_diffusing=False) - + + # !!! + # obstruction_prompt="A construction zone occupies part of the right lane, with cones, workers, and safety signs." + + # !!! + # cond (positive conditioning): the segmentation map becomes the control tensor; prompt + a_prompt is the cross-attention condition cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]} + # cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)], + # "obstruction_c_crossattn": [model.get_learned_conditioning([obstruction_prompt] * num_samples)]} + + # !!!
+ # un_cond (negative conditioning): the segmentation map becomes the control tensor; n_prompt is the negative cross-attention condition un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]} - shape = (4, H // 8, W // 8) + # un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)], + # "obstruction_c_crossattn": None} + + shape = (4, H // 8, W // 8) # H 512, W 960 -> shape (4, 64, 120) if config.save_memory: model.low_vram_shift(is_diffusing=True) + # control weights model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 + # samples -> shape=torch.Size([1,4,64,120]) samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples, shape, cond, verbose=False, eta=eta, unconditional_guidance_scale=scale, @@ -59,17 +88,17 @@ def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resoluti if config.save_memory: model.low_vram_shift(is_diffusing=False) - x_samples = model.decode_first_stage(samples) + x_samples = model.decode_first_stage(samples) # shape=torch.Size([1,3,512,960]) x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8) - results = [x_samples[i] for i in range(num_samples)] + results = [x_samples[i] for i in range(num_samples)] # list return [detected_map] + results block = gr.Blocks().queue() with block: with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Segmentation Maps") + gr.Markdown("## Control Stable Diffusion with Segmentation Maps (Mask input)") with gr.Row(): with gr.Column(): input_image = gr.Image(source='upload', type="numpy") diff --git a/ldm/models/diffusion/ddpm.py b/ldm/models/diffusion/ddpm.py index f71a44af48..9e95e0e73d 100644 --- a/ldm/models/diffusion/ddpm.py +++ b/ldm/models/diffusion/ddpm.py @@ -17,7 +17,8 @@ import itertools from tqdm import tqdm from torchvision.utils import make_grid -from pytorch_lightning.utilities.distributed import rank_zero_only +# from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.rank_zero import rank_zero_only from omegaconf import ListConfig from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config @@ -445,11 +446,11 @@ def training_step(self, batch, batch_idx): logger=True, on_step=True, on_epoch=True) self.log("global_step", self.global_step, - prog_bar=True, logger=True, on_step=True, on_epoch=False) + prog_bar=True, logger=True, on_step=True, on_epoch=False, batch_size=len(batch['txt'])) if self.use_scheduler: lr = self.optimizers().param_groups[0]['lr'] - self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False) + self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False, batch_size=len(batch['txt'])) return loss diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py index 4edd5496b9..8521394dac 100644 --- a/ldm/modules/encoders/modules.py +++ b/ldm/modules/encoders/modules.py @@ -92,7 +92,9 @@ class FrozenCLIPEmbedder(AbstractEncoder): "pooled", "hidden" ] - def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77, + # def __init__(self, version="/home/data2/yangsp22/openaiclip-vit-large-patch14", device="cuda", max_length=77, + # freeze=True, layer="last", layer_idx=None): # clip-vit-base-patch32 + def __init__(self, 
version="clip-vit-base-patch32", device="cuda", max_length=77, freeze=True, layer="last", layer_idx=None): # clip-vit-base-patch32 super().__init__() assert layer in self.LAYERS @@ -140,8 +142,10 @@ class FrozenOpenCLIPEmbedder(AbstractEncoder): "last", "penultimate" ] + # def __init__(self, arch="ViT-H-14", version="/home/data2/yangsp22/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", device="cuda", max_length=77, + # freeze=True, layer="last"): # version="laion2b_s32b_b79k" def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77, - freeze=True, layer="last"): + freeze=True, layer="last"): # version="laion2b_s32b_b79k" super().__init__() assert layer in self.LAYERS model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version) diff --git a/ldm/util.py b/ldm/util.py index 45cb050ece..ef3183f86b 100644 --- a/ldm/util.py +++ b/ldm/util.py @@ -76,6 +76,7 @@ def instantiate_from_config(config): elif config == "__is_unconditional__": return None raise KeyError("Expected key `target` to instantiate.") + #!!! return get_obj_from_str(config["target"])(**config.get("params", dict())) @@ -84,6 +85,8 @@ def get_obj_from_str(string, reload=False): if reload: module_imp = importlib.import_module(module) importlib.reload(module_imp) + #!!! + # print(getattr(importlib.import_module(module, package=None), cls)) return getattr(importlib.import_module(module, package=None), cls)