-
Notifications
You must be signed in to change notification settings - Fork 2.8k
feat: add per-model FP8 layerwise casting for VRAM reduction #8945
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6f52606
bf3bd2e
afe246e
2262d8d
5327df8
6c13fca
0d7b39f
a0df643
06ad3c7
025759f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -139,6 +139,7 @@ def _load_model( | |
| local_files_only=True, | ||
| ) | ||
|
|
||
| model = self._apply_fp8_layerwise_casting(model, config, submodel_type) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this only apply to the v2 VAE and diffusers models? What about GGUF?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. GGUF and BnB models are intentionally excluded — they already use their own quantization (typically Q4/Q8), so applying FP8 layerwise casting on top would be redundant and would likely conflict with their dequantization logic during inference. |
||
| return model | ||
|
|
||
|
|
||
|
|
@@ -201,6 +202,7 @@ def _load_model( | |
| vae_dtype = self._torch_dtype | ||
| model.to(vae_dtype) | ||
|
|
||
| model = self._apply_fp8_layerwise_casting(model, config, submodel_type) | ||
| return model | ||
|
|
||
| def _convert_flux2_vae_bfl_to_diffusers(self, sd: dict) -> dict: | ||
|
|
@@ -485,7 +487,9 @@ def _load_model( | |
|
|
||
| match submodel_type: | ||
| case SubModelType.Transformer: | ||
| return self._load_from_singlefile(config) | ||
| model = self._load_from_singlefile(config) | ||
| model = self._apply_fp8_layerwise_casting(model, config, submodel_type) | ||
| return model | ||
|
|
||
| raise ValueError( | ||
| f"Only Transformer submodels are currently supported. Received: {submodel_type.value if submodel_type else 'None'}" | ||
|
|
@@ -639,6 +643,7 @@ def _load_model( | |
| else: | ||
| raise e | ||
|
|
||
| result = self._apply_fp8_layerwise_casting(result, config, submodel_type) | ||
| return result | ||
|
|
||
|
|
||
|
|
@@ -715,6 +720,7 @@ def _load_model( | |
| if guidance_emb.linear_2.bias is not None: | ||
| guidance_emb.linear_2.bias.data.zero_() | ||
|
|
||
| result = self._apply_fp8_layerwise_casting(result, config, submodel_type) | ||
| return result | ||
|
|
||
|
|
||
|
|
@@ -732,7 +738,9 @@ def _load_model( | |
|
|
||
| match submodel_type: | ||
| case SubModelType.Transformer: | ||
| return self._load_from_singlefile(config) | ||
| model = self._load_from_singlefile(config) | ||
| model = self._apply_fp8_layerwise_casting(model, config, submodel_type) | ||
| return model | ||
|
|
||
| raise ValueError( | ||
| f"Only Transformer submodels are currently supported. Received: {submodel_type.value if submodel_type else 'None'}" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| import { FormControl, FormLabel, Switch } from '@invoke-ai/ui-library'; | ||
| import { InformationalPopover } from 'common/components/InformationalPopover/InformationalPopover'; | ||
| import type { ChangeEvent } from 'react'; | ||
| import { memo, useCallback, useMemo } from 'react'; | ||
| import type { UseControllerProps } from 'react-hook-form'; | ||
| import { useController } from 'react-hook-form'; | ||
| import { useTranslation } from 'react-i18next'; | ||
|
|
||
| import type { ControlAdapterModelDefaultSettingsFormData } from './ControlAdapterModelDefaultSettings'; | ||
|
|
||
| type DefaultFp8StorageType = ControlAdapterModelDefaultSettingsFormData['fp8Storage']; | ||
|
|
||
| export const DefaultFp8StorageControlAdapter = memo( | ||
| (props: UseControllerProps<ControlAdapterModelDefaultSettingsFormData>) => { | ||
| const { t } = useTranslation(); | ||
| const { field } = useController(props); | ||
|
|
||
| const onChange = useCallback( | ||
| (e: ChangeEvent<HTMLInputElement>) => { | ||
| const updatedValue = { | ||
| ...(field.value as DefaultFp8StorageType), | ||
| value: e.target.checked, | ||
| isEnabled: e.target.checked, | ||
| }; | ||
| field.onChange(updatedValue); | ||
| }, | ||
| [field] | ||
| ); | ||
|
|
||
| const value = useMemo(() => { | ||
| return (field.value as DefaultFp8StorageType).value; | ||
| }, [field.value]); | ||
|
|
||
| return ( | ||
| <FormControl> | ||
| <InformationalPopover feature="fp8Storage"> | ||
| <FormLabel>{t('modelManager.fp8Storage')}</FormLabel> | ||
| </InformationalPopover> | ||
| <Switch isChecked={value} onChange={onChange} /> | ||
| </FormControl> | ||
| ); | ||
| } | ||
| ); | ||
|
|
||
| DefaultFp8StorageControlAdapter.displayName = 'DefaultFp8StorageControlAdapter'; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this check `self._get_execution_device()` to make sure the model is to be executed on CUDA?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It checks "does the system even have CUDA?", not "does this model run on CUDA?". Both lead to the same result, but `_torch_device` is semantically a better fit for a hardware-capability check. The only difference is that `_get_execution_device()` requires `config` and `submodel_type` as parameters, while `_torch_device` does not.