From f2f2dd4831f713563b0e16086d61041a6513accf Mon Sep 17 00:00:00 2001 From: Eddie Mattia Date: Fri, 6 Jun 2025 13:53:08 -0700 Subject: [PATCH 1/2] add @model and @huggingface_hub docs --- docs/index.md | 1 + docs/scaling/checkpoint/huggingface-hub.md | 165 ++++++++++++++++++ docs/scaling/checkpoint/introduction.md | 7 +- .../introduction.md | 114 ++++++++++++ sidebars.js | 13 +- 5 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 docs/scaling/checkpoint/huggingface-hub.md create mode 100644 docs/scaling/model-lifecycle-management/introduction.md diff --git a/docs/index.md b/docs/index.md index b5735f2b..4394261b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,6 +42,7 @@ Metaflow makes it easy to build and manage real-life data science, AI, and ML pr - [Managing Dependencies](scaling/dependencies) ✨*New support for `uv`*✨ - [Dealing with Failures](scaling/failures) - [Checkpointing Progress](scaling/checkpoint/introduction) ✨*New*✨ +- [Managing Model Lifecycle](scaling/model-lifecycle-management/introduction) ✨*New*✨ - [Loading and Storing Data](scaling/data) - [Organizing Results](scaling/tagging) - [Accessing Secrets](scaling/secrets) diff --git a/docs/scaling/checkpoint/huggingface-hub.md b/docs/scaling/checkpoint/huggingface-hub.md new file mode 100644 index 00000000..1def9801 --- /dev/null +++ b/docs/scaling/checkpoint/huggingface-hub.md @@ -0,0 +1,165 @@ +# HuggingFace Hub Integration + +Metaflow includes a wrapper around `@checkpoint` specific to assets in the [HuggingFace Hub](https://huggingface.co/), a popular platform for sharing and discovering machine learning models and datasets. The `@huggingface_hub` decorator simplifies the process of downloading, caching, and managing models from the HuggingFace Hub within your Metaflow workflows. + + +## Understanding `@huggingface_hub` + +The `@huggingface_hub` decorator offers two main functionalities: + +1. 
**Loading models directly in steps**: Using the `load` parameter, you can specify models to be downloaded and cached for use within a step. +2. **Dynamic model downloading**: Through the `current.huggingface_hub.snapshot_download` function, you can download models at runtime and store references for use in subsequent steps. + +All models are stored in the same way the `@checkpoint` decorator stores checkpoints, ensuring efficient caching and versioning. + +## Using `@huggingface_hub` + +### Loading Static Repositories for a Step + +For workflows that require specific, unchanging models or datasets from HuggingFace, you can use the `load` parameter of the `@huggingface_hub` decorator. This approach is useful when models don't change often and can be hard-coded into the flow. + +The `load` parameter can accept arguments in several formats: + +1. A list of strings representing repository IDs: + +```python +@huggingface_hub(load=["bert-base-uncased"]) +@step +def process_text(self): + model_path = current.huggingface_hub.loaded["bert-base-uncased"] + # Use the model... +``` + +2. A list of tuples specifying the repository ID and a local path: + +```python +@huggingface_hub(load=[("bert-base-uncased", "./model_directory")]) +@step +def process_text(self): + model_path = current.huggingface_hub.loaded["bert-base-uncased"] + # Use the model... +``` + +3. A list of dictionaries providing detailed configuration for the download: + +```python +@huggingface_hub(load=[ + { + "repo_id": "bert-base-uncased", + "allow_patterns": ["*.json", "tokenizer.txt"], + "repo_type": "model" + } +]) +@step +def process_text(self): + model_path = current.huggingface_hub.loaded["bert-base-uncased"] + # Use the model... 
+``` + +The dictionary format allows you to specify all the parameters supported by HuggingFace's [`snapshot_download`](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/file_download#huggingface_hub.snapshot_download) function, enabling fine-grained control over what gets downloaded. + +### Loading HF Repositories Dynamically + +In some workflows, a model or dataset must be determined at runtime, perhaps based on exogenous system events or results produced by upstream workflows. In these cases, you can use the `current.huggingface_hub.snapshot_download` function to dynamically download and cache models. + +```python +@huggingface_hub +@step +def start(self): + # Download a model from HuggingFace Hub + self.hf_model_reference = current.huggingface_hub.snapshot_download( + repo_id="bert-base-uncased", + allow_patterns=["*.json"] # Only download specific files + ) + self.next(self.end) + +@model(load="hf_model_reference") +@step +def end(self): + model_path = current.model.loaded["hf_model_reference"] + # Use the model... +``` + +The `snapshot_download` function returns a reference to the downloaded model, which can be stored as an artifact and loaded in subsequent steps using the `@model` decorator. If you pass `force_download=True`, the function bypasses the cache and downloads the model again. 
+ +## Complete Example + +Here's a complete example that demonstrates both static and dynamic loading of HuggingFace models: + +```python +from metaflow import FlowSpec, step, current, huggingface_hub, model, Parameter, pypi + +class HuggingFaceModelFlow(FlowSpec): + + model_name = Parameter( + "model-name", + help="Name of the HuggingFace model to use", + default="bert-base-uncased" + ) + + @pypi(packages={"huggingface-hub": "0.16.4"}) + @huggingface_hub(load=["google-bert/bert-base-uncased"]) + @step + def start(self): + import os + + # Access the statically loaded model + static_model_path = current.huggingface_hub.loaded["google-bert/bert-base-uncased"] + print(f"Static model loaded at: {static_model_path}") + print(f"Contents: {os.listdir(static_model_path)}") + + # Dynamically download the model specified by the parameter + self.dynamic_model_ref = current.huggingface_hub.snapshot_download( + repo_id=self.model_name, + allow_patterns=["*.json", "*.txt"] # Only download specific files + ) + + print(f"Dynamic model reference saved with key: {self.dynamic_model_ref['key']}") + self.next(self.end) + + @model(load="dynamic_model_ref") + @step + def end(self): + import os + + # Access the dynamically loaded model + model_path = current.model.loaded["dynamic_model_ref"] + print(f"Dynamic model loaded at: {model_path}") + print(f"Contents: {os.listdir(model_path)}") + +if __name__ == "__main__": + HuggingFaceModelFlow() +``` + +You can run this flow with: + +```bash +python flow.py --environment=pypi run --model-name roberta-base +``` + +## Best Practices + +1. **Selective Downloads**: Use the `allow_patterns` parameter to download only the files you need, especially for large models. This can significantly reduce download times and storage requirements. + +2. **Caching Strategy**: Leverage the caching mechanism by default, but use `force_download=True` when you need to ensure you have the latest version of a model. + +3. 
**Error Handling**: Be prepared to handle network issues or API rate limits when downloading models from HuggingFace Hub. We have observed occasional timeouts. Consider adding retry logic, for example with Metaflow's `@retry` decorator, for robustness. + +4. **Version Pinning**: When using models in production workflows, consider pinning a specific model version with the `revision` parameter (a branch name, tag, or commit hash) to ensure reproducibility. + +5. **Authentication**: For accessing private or gated models, you'll need to set up authentication. You can use the HuggingFace CLI to log in or set the `HF_TOKEN` environment variable. On Outerbounds, you can do this by going to the `Integrations` tab and registering your desired HuggingFace token. + +## Integration with Metaflow's Model Management + +The `@huggingface_hub` decorator integrates seamlessly with Metaflow's model management system. Models downloaded from HuggingFace Hub are stored in Metaflow's datastore, making them available for versioning, tracking, and sharing across your organization. + +By combining it with the `@model` decorator, you can create powerful workflows that leverage pre-trained models from HuggingFace Hub, fine-tune them for your specific use case, and manage the entire model lifecycle within Metaflow. + +This integration enables efficient workflows where: + +1. Models are downloaded once and cached for future use +2. Large models can be shared across multiple flows without duplicating storage +3. Model versions are tracked and managed alongside your code +4. Models can be easily deployed to production environments + +By leveraging the `@huggingface_hub` decorator, you can streamline your machine learning workflows and focus on building and deploying models rather than managing infrastructure and dependencies. Continue to the [`@model` documentation](../model-lifecycle-management/introduction) for more details. 
diff --git a/docs/scaling/checkpoint/introduction.md b/docs/scaling/checkpoint/introduction.md index 83295ec3..cb534a43 100644 --- a/docs/scaling/checkpoint/introduction.md +++ b/docs/scaling/checkpoint/introduction.md @@ -28,7 +28,12 @@ guaranteed to stay backwards compatible. Please share your feedback on ## Installing `@checkpoint` -To use the `@checkpoint` extension, install it with +If you use Metaflow through Outerbounds, this functionality is installed by default when you run +``` +pip install outerbounds +``` + +To use the `@checkpoint` extension with a self-managed Metaflow deployment, install it with ``` pip install metaflow-checkpoint ``` diff --git a/docs/scaling/model-lifecycle-management/introduction.md b/docs/scaling/model-lifecycle-management/introduction.md new file mode 100644 index 00000000..676acab8 --- /dev/null +++ b/docs/scaling/model-lifecycle-management/introduction.md @@ -0,0 +1,114 @@ +# Managing Model Lifecycle + +Metaflow's model management features are built on the same foundation as [checkpointing](../checkpoint/introduction.md). +While checkpointing is primarily about managing state within Metaflow tasks, +model lifecycle management presents several orthogonal operational challenges: + +1. **Version Control**: Tracking different versions of models as they evolve +2. **Metadata Management**: Attaching important information about models (accuracy, training parameters, etc.) +3. **Discoverability**: Finding and loading the right model for inference or further training +4. **Portability**: Moving models between different environments and workflows + +The `@model` decorator addresses these challenges by providing a standardized way to save, load, and manage models +within Metaflow. 
Like the `@checkpoint` functionality described [in the previous section](../checkpoint/introduction.md), `@model` integrates seamlessly with Metaflow's datastore, making it easy to version, track, and share +models across your organization in an efficient manner. + +:::info +The `@model` decorator is part of the `metaflow-checkpoint` extension, +so developers who self-manage Metaflow and its plugins have to install it separately as +described in [installing `@checkpoint`](/scaling/checkpoint/introduction#installing-checkpoint). +Also, the APIs may change in the future, in contrast to the APIs of core Metaflow which are +guaranteed to stay backwards compatible. Please share your feedback on +[Metaflow Slack](http://slack.outerbounds.co)! +::: + +## Using `@model` + +The `@model` decorator provides a simple interface for saving and loading models within your Metaflow steps. It's +particularly useful for machine learning workflows where you need to train, evaluate, and deploy models. + +Let's demonstrate the functionality with this simple flow that trains an XGBoost model and saves it using the `@model` +decorator: + +```python +from metaflow import FlowSpec, step, model, current, pypi_base + +@pypi_base(python="3.12", packages={"xgboost": "3.0.2"}) +class XGBoostModelFlow(FlowSpec): + + @model + @step + def start(self): + import numpy as np # pylint: disable=import-error + import xgboost as xgb # pylint: disable=import-error + + # Create a simple dataset + X = np.random.rand(100, 10) + y = np.random.randint(2, size=100) + dtrain = xgb.DMatrix(X, label=y) + + # Train a model + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} + num_round = 2 + bst = xgb.train(param, dtrain, num_round) + + # Save the model to a file + bst.save_model("model.bst") + + # Save the model to Metaflow's datastore + self.xgboost_model = current.model.save( + "model.bst", + label="xgboost_classifier", + metadata={ + "accuracy": 0.95, + "training_params": param, + "num_rounds": 
num_round + } + ) + self.next(self.end) + + @model(load="xgboost_model") # using the Metaflow artifact key. + @step + def end(self): + import os + import xgboost as xgb # pylint: disable=import-error + + # Access the loaded model in another step + # Whether local or remote mode --> we have cached model for future use downstream + # If remote mode --> we are now passing model state seamlessly across task/pod boundaries + model_path = os.path.join(current.model.loaded["xgboost_model"], "model.bst") + bst = xgb.Booster() + bst.load_model(model_path) + + # Use reloaded model + import numpy as np # pylint: disable=import-error + prediction = bst.predict(xgb.DMatrix([np.random.rand(10)])) + print(f"Prediction: {prediction}") + +if __name__ == "__main__": + XGBoostModelFlow() +``` + +You can run the flow as usual: +``` +python flow.py --environment=pypi run +``` + +The flow demonstrates typical usage of `@model`: + +- `@model` activates the `current.model` object. Here we use it in the start step to upload our model after training completes. The resulting `self.xgboost_model` artifact contains a pointer to the model state, cached in Metaflow's datastore. + +- `current.model.save()` saves a model file or directory to Metaflow's datastore, along with optional metadata and a label. It returns a reference to the saved model that can be stored as an artifact. + +- `@model(load="xgboost_model")` loads the model referenced by the `xgboost_model` artifact created in the previous step. The loaded model is accessible via `current.model.loaded["xgboost_model"]`. + +- The model's metadata is accessible via `current.model.loaded.info["xgboost_model"]`. + +## Observing `@model` through cards + +Try running the above. 
It will produce a [Metaflow `@card`](/metaflow/visualizing-results/effortless-task-inspection-with-default-cards) detailing your model's state: +``` +python flow.py --environment=pypi card view start +``` + +If a step is decorated with `@model`, its card includes information about the models saved and loaded in that step, including metadata and lineage information. \ No newline at end of file diff --git a/sidebars.js b/sidebars.js index 906127dc..3813187d 100644 --- a/sidebars.js +++ b/sidebars.js @@ -181,7 +181,18 @@ const sidebars = { items: [ "scaling/checkpoint/checkpoint-ml-libraries", "scaling/checkpoint/selecting-checkpoints", - + "scaling/checkpoint/huggingface-hub", + ], + }, + { + type: "category", + label: "Managing Model Lifecycle", + link: { + type: "doc", + id: "scaling/model-lifecycle-management/introduction", + }, + items: [ + ], }, "scaling/data", From 054cb5c12c60b08fe9b27a93864ba85ed8342c13 Mon Sep 17 00:00:00 2001 From: Eddie Mattia Date: Fri, 6 Jun 2025 14:28:08 -0700 Subject: [PATCH 2/2] align @huggingface_hub install section --- docs/scaling/checkpoint/huggingface-hub.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/scaling/checkpoint/huggingface-hub.md b/docs/scaling/checkpoint/huggingface-hub.md index 1def9801..d92628f1 100644 --- a/docs/scaling/checkpoint/huggingface-hub.md +++ b/docs/scaling/checkpoint/huggingface-hub.md @@ -2,6 +2,14 @@ Metaflow includes a wrapper around `@checkpoint` specific to assets in the [HuggingFace Hub](https://huggingface.co/), a popular platform for sharing and discovering machine learning models and datasets. The `@huggingface_hub` decorator simplifies the process of downloading, caching, and managing models from the HuggingFace Hub within your Metaflow workflows. 
+:::info +The `@huggingface_hub` decorator is part of the `metaflow-checkpoint` extension, +so developers who self-manage Metaflow and its plugins have to install it separately as +described in [installing `@checkpoint`](/scaling/checkpoint/introduction#installing-checkpoint). +Also, the APIs may change in the future, in contrast to the APIs of core Metaflow which are +guaranteed to stay backwards compatible. Please share your feedback on +[Metaflow Slack](http://slack.outerbounds.co)! +::: ## Understanding `@huggingface_hub`