Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions invokeai/backend/model_manager/configs/controlnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@
class ControlAdapterDefaultSettings(BaseModel):
# This could be narrowed to controlnet processor nodes, but they change. Leaving this a string is safer.
preprocessor: str | None
fp8_storage: bool | None = Field(
default=None,
description="Store weights in FP8 to reduce VRAM usage (~50% savings). Weights are cast to compute dtype during inference.",
)
model_config = ConfigDict(extra="forbid")

@classmethod
Expand Down
4 changes: 4 additions & 0 deletions invokeai/backend/model_manager/configs/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ class MainModelDefaultSettings(BaseModel):
height: int | None = Field(default=None, multiple_of=8, ge=64, description="Default height for this model")
guidance: float | None = Field(default=None, ge=1, description="Default Guidance for this model")
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")
fp8_storage: bool | None = Field(
default=None,
description="Store weights in FP8 to reduce VRAM usage (~50% savings). Weights are cast to compute dtype during inference.",
)

model_config = ConfigDict(extra="forbid")

Expand Down
102 changes: 102 additions & 0 deletions invokeai/backend/model_manager/load/load_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,108 @@ def get_size_fs(
variant=config.repo_variant if isinstance(config, Diffusers_Config_Base) else None,
)

def _should_use_fp8(self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> bool:
    """Decide whether FP8 layerwise casting should be applied to this model.

    FP8 storage requires CUDA hardware, is disabled for Z-Image models
    (dtype mismatches with diffusers' layerwise casting), and is never
    applied to auxiliary submodels (text encoders, tokenizers, schedulers,
    safety checker). Otherwise it is opt-in via
    ``config.default_settings.fp8_storage``.
    """
    # Hardware capability gate: FP8 storage is CUDA-only. This asks "does
    # the system have CUDA?", not "does this model run on CUDA?".
    if self._torch_device.type != "cuda":
        return False

    from invokeai.backend.model_manager.taxonomy import BaseModelType

    # Z-Image has dtype mismatch issues with diffusers' layerwise casting
    # (skipped modules produce bf16, hooked modules expect fp16).
    if getattr(config, "base", None) == BaseModelType.ZImage:
        return False

    # FP8 is aimed at the denoiser; leave text encoders, tokenizers,
    # schedulers and the safety checker at full precision.
    excluded_submodels = {
        SubModelType.TextEncoder,
        SubModelType.TextEncoder2,
        SubModelType.TextEncoder3,
        SubModelType.Tokenizer,
        SubModelType.Tokenizer2,
        SubModelType.Tokenizer3,
        SubModelType.Scheduler,
        SubModelType.SafetyChecker,
    }
    if submodel_type in excluded_submodels:
        return False

    # Opt-in flag lives on default_settings (Main models, ControlNet).
    settings = getattr(config, "default_settings", None)
    return getattr(settings, "fp8_storage", None) is True

def _apply_fp8_layerwise_casting(
    self, model: AnyModel, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None
) -> AnyModel:
    """Enable FP8 weight storage on ``model`` when its config opts in.

    Diffusers models use their built-in ``enable_layerwise_casting``;
    other ``nn.Module``s get manual forward-hook casting. Objects that are
    not torch modules are returned untouched.
    """
    if not self._should_use_fp8(config, submodel_type):
        return model

    storage_dtype = torch.float8_e4m3fn

    # Prefer the model's own parameter dtype as the compute dtype: some
    # models (e.g. Flux) require a specific dtype (bf16) that differs from
    # the global torch dtype (fp16).
    compute_dtype = self._torch_dtype
    if isinstance(model, torch.nn.Module):
        param = next(model.parameters(), None)
        if param is not None:
            compute_dtype = param.dtype

    from diffusers.models.modeling_utils import ModelMixin

    if isinstance(model, ModelMixin):
        model.enable_layerwise_casting(
            storage_dtype=storage_dtype,
            compute_dtype=compute_dtype,
        )
    elif isinstance(model, torch.nn.Module):
        self._apply_fp8_to_nn_module(model, storage_dtype=storage_dtype, compute_dtype=compute_dtype)
    else:
        # Not a torch module at all -- nothing to cast.
        return model

    # Report the post-cast footprint so the VRAM saving is visible in logs.
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    self._logger.info(
        f"FP8 layerwise casting enabled for {config.name} "
        f"(storage=float8_e4m3fn, compute={compute_dtype}, "
        f"param_size={param_bytes / (1024**2):.0f}MB)"
    )
    return model

@staticmethod
def _apply_fp8_to_nn_module(model: torch.nn.Module, storage_dtype: torch.dtype, compute_dtype: torch.dtype) -> None:
"""Apply FP8 layerwise casting to a plain nn.Module using forward hooks."""
for module in model.modules():
params = list(module.parameters(recurse=False))
if not params:
continue

# Convert this module's own parameters to FP8 storage dtype
for param in params:
param.data = param.data.to(storage_dtype)

# Pre-hook: cast to compute dtype before forward
def _make_pre_hook(dt: torch.dtype):
def hook(mod: torch.nn.Module, _args: object) -> None:
for p in mod.parameters(recurse=False):
p.data = p.data.to(dt)

return hook

# Post-hook: cast back to storage dtype after forward
def _make_post_hook(dt: torch.dtype):
def hook(mod: torch.nn.Module, _args: object, _output: object) -> None:
for p in mod.parameters(recurse=False):
p.data = p.data.to(dt)

return hook

module.register_forward_pre_hook(_make_pre_hook(compute_dtype))
module.register_forward_hook(_make_post_hook(storage_dtype))

# This needs to be implemented in the subclass
def _load_model(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,5 @@ def _load_model(
else:
raise e

result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ def _load_model(
submodel_type: Optional[SubModelType] = None,
) -> AnyModel:
if isinstance(config, ControlNet_Checkpoint_Config_Base):
return ControlNetModel.from_single_file(
result = ControlNetModel.from_single_file(
config.path,
torch_dtype=self._torch_dtype,
)
result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result
else:
return super()._load_model(config, submodel_type)
12 changes: 10 additions & 2 deletions invokeai/backend/model_manager/load/model_loaders/flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def _load_model(
local_files_only=True,
)

model = self._apply_fp8_layerwise_casting(model, config, submodel_type)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this only apply to the v2 VAE and diffusers models? What about GGUF?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GGUF and BnB models are intentionally excluded — they already use their own quantization (typically Q4/Q8), so applying FP8 layerwise casting on top would be redundant and likely conflict with their dequantization logic during inference.

return model


Expand Down Expand Up @@ -201,6 +202,7 @@ def _load_model(
vae_dtype = self._torch_dtype
model.to(vae_dtype)

model = self._apply_fp8_layerwise_casting(model, config, submodel_type)
return model

def _convert_flux2_vae_bfl_to_diffusers(self, sd: dict) -> dict:
Expand Down Expand Up @@ -485,7 +487,9 @@ def _load_model(

match submodel_type:
case SubModelType.Transformer:
return self._load_from_singlefile(config)
model = self._load_from_singlefile(config)
model = self._apply_fp8_layerwise_casting(model, config, submodel_type)
return model

raise ValueError(
f"Only Transformer submodels are currently supported. Received: {submodel_type.value if submodel_type else 'None'}"
Expand Down Expand Up @@ -639,6 +643,7 @@ def _load_model(
else:
raise e

result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result


Expand Down Expand Up @@ -715,6 +720,7 @@ def _load_model(
if guidance_emb.linear_2.bias is not None:
guidance_emb.linear_2.bias.data.zero_()

result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result


Expand All @@ -732,7 +738,9 @@ def _load_model(

match submodel_type:
case SubModelType.Transformer:
return self._load_from_singlefile(config)
model = self._load_from_singlefile(config)
model = self._apply_fp8_layerwise_casting(model, config, submodel_type)
return model

raise ValueError(
f"Only Transformer submodels are currently supported. Received: {submodel_type.value if submodel_type else 'None'}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def _load_model(
result = model_class.from_pretrained(model_path, torch_dtype=self._torch_dtype, local_files_only=True)
else:
raise e
result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result

# TO DO: Add exception handling
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def _load_model(
else:
raise e

result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result

def _load_from_singlefile(
Expand Down Expand Up @@ -152,5 +153,8 @@ def _load_from_singlefile(
if subtype == submodel_type:
continue
if submodel := getattr(pipeline, subtype.value, None):
self._apply_fp8_layerwise_casting(submodel, config, subtype)
self._ram_cache.put(get_model_cache_key(config.key, subtype), model=submodel)
return getattr(pipeline, submodel_type.value)
result = getattr(pipeline, submodel_type.value)
result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result
4 changes: 3 additions & 1 deletion invokeai/backend/model_manager/load/model_loaders/vae.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ def _load_model(
submodel_type: Optional[SubModelType] = None,
) -> AnyModel:
if isinstance(config, VAE_Checkpoint_Config_Base):
return AutoencoderKL.from_single_file(
result = AutoencoderKL.from_single_file(
config.path,
torch_dtype=self._torch_dtype,
)
result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result
else:
return super()._load_model(config, submodel_type)
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def _load_model(
else:
raise e

result = self._apply_fp8_layerwise_casting(result, config, submodel_type)
return result


Expand Down
8 changes: 8 additions & 0 deletions invokeai/frontend/web/public/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,7 @@
"convertToDiffusersHelpText5": "Please make sure you have enough disk space. Models generally vary between 2GB-7GB in size.",
"convertToDiffusersHelpText6": "Do you wish to convert this model?",
"cpuOnly": "CPU Only",
"fp8Storage": "FP8 Storage (Save VRAM)",
"runOnCpu": "Run text encoder model on CPU only",
"noDefaultSettings": "No default settings configured for this model. Visit the Model Manager to add default settings.",
"defaultSettings": "Default Settings",
Expand Down Expand Up @@ -2198,6 +2199,13 @@
"When enabled, only the text encoder component will run on CPU instead of GPU.",
"This saves VRAM for the denoiser while only slightly impacting performance. The conditioning outputs are automatically moved to GPU for the denoiser."
]
},
"fp8Storage": {
"heading": "FP8 Storage",
"paragraphs": [
"Stores model weights in FP8 format in VRAM, reducing memory usage by approximately 50% compared to FP16.",
"During inference, weights are cast layer-by-layer to the compute precision (FP16/BF16), so image quality is preserved. Works on all CUDA GPUs."
]
}
},
"workflows": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ export type Feature =
| 'tileOverlap'
| 'optimizedDenoising'
| 'fluxDevLicense'
| 'cpuOnly';
| 'cpuOnly'
| 'fp8Storage';

export type PopoverData = PopoverProps & {
image?: string;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ export const useControlAdapterModelDefaultSettings = (
isEnabled: !isNil(modelConfig?.default_settings?.preprocessor),
value: modelConfig?.default_settings?.preprocessor || 'none',
},
fp8Storage: {
isEnabled: !isNil(modelConfig?.default_settings?.fp8_storage),
value: modelConfig?.default_settings?.fp8_storage ?? false,
},
};
}, [modelConfig?.default_settings]);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ export const useMainModelDefaultSettings = (modelConfig: MainModelConfig) => {
isEnabled: !isNil(modelConfig?.default_settings?.guidance),
value: modelConfig?.default_settings?.guidance ?? 4,
},
fp8Storage: {
isEnabled: !isNil(modelConfig?.default_settings?.fp8_storage),
value: modelConfig?.default_settings?.fp8_storage ?? false,
},
};
}, [modelConfig]);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { Button, Flex, Heading, SimpleGrid } from '@invoke-ai/ui-library';
import { useControlAdapterModelDefaultSettings } from 'features/modelManagerV2/hooks/useControlAdapterModelDefaultSettings';
import { useIsModelManagerEnabled } from 'features/modelManagerV2/hooks/useIsModelManagerEnabled';
import { DefaultFp8StorageControlAdapter } from 'features/modelManagerV2/subpanels/ModelPanel/ControlAdapterModelDefaultSettings/DefaultFp8StorageControlAdapter';
import { DefaultPreprocessor } from 'features/modelManagerV2/subpanels/ModelPanel/ControlAdapterModelDefaultSettings/DefaultPreprocessor';
import type { FormField } from 'features/modelManagerV2/subpanels/ModelPanel/MainModelDefaultSettings/MainModelDefaultSettings';
import { toast } from 'features/toast/toast';
Expand All @@ -14,6 +15,7 @@ import type { ControlLoRAModelConfig, ControlNetModelConfig, T2IAdapterModelConf

export type ControlAdapterModelDefaultSettingsFormData = {
preprocessor: FormField<string>;
fp8Storage: FormField<boolean>;
};

type Props = {
Expand All @@ -40,6 +42,7 @@ export const ControlAdapterModelDefaultSettings = memo(({ modelConfig }: Props)
(data) => {
const body = {
preprocessor: data.preprocessor.isEnabled ? data.preprocessor.value : null,
fp8_storage: data.fp8Storage.isEnabled ? data.fp8Storage.value : null,
};

updateModel({
Expand Down Expand Up @@ -88,6 +91,7 @@ export const ControlAdapterModelDefaultSettings = memo(({ modelConfig }: Props)

<SimpleGrid columns={2} gap={8}>
<DefaultPreprocessor control={control} name="preprocessor" />
<DefaultFp8StorageControlAdapter control={control} name="fp8Storage" />
</SimpleGrid>
</>
);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { FormControl, FormLabel, Switch } from '@invoke-ai/ui-library';
import { InformationalPopover } from 'common/components/InformationalPopover/InformationalPopover';
import type { ChangeEvent } from 'react';
import { memo, useCallback, useMemo } from 'react';
import type { UseControllerProps } from 'react-hook-form';
import { useController } from 'react-hook-form';
import { useTranslation } from 'react-i18next';

import type { ControlAdapterModelDefaultSettingsFormData } from './ControlAdapterModelDefaultSettings';

type DefaultFp8StorageType = ControlAdapterModelDefaultSettingsFormData['fp8Storage'];

export const DefaultFp8StorageControlAdapter = memo(
  (props: UseControllerProps<ControlAdapterModelDefaultSettingsFormData>) => {
    const { t } = useTranslation();
    const { field } = useController(props);

    // Current switch position, derived from the form field's value object.
    const isChecked = useMemo(() => (field.value as DefaultFp8StorageType).value, [field.value]);

    const handleChange = useCallback(
      (e: ChangeEvent<HTMLInputElement>) => {
        const checked = e.target.checked;
        // Toggling the switch both sets the value and marks the default
        // setting enabled/disabled in one update.
        field.onChange({
          ...(field.value as DefaultFp8StorageType),
          value: checked,
          isEnabled: checked,
        });
      },
      [field]
    );

    return (
      <FormControl>
        <InformationalPopover feature="fp8Storage">
          <FormLabel>{t('modelManager.fp8Storage')}</FormLabel>
        </InformationalPopover>
        <Switch isChecked={isChecked} onChange={handleChange} />
      </FormControl>
    );
  }
);

DefaultFp8StorageControlAdapter.displayName = 'DefaultFp8StorageControlAdapter';
Loading
Loading