From f2f2dd4831f713563b0e16086d61041a6513accf Mon Sep 17 00:00:00 2001
From: Eddie Mattia <eddie@outerbounds.com>
Date: Fri, 6 Jun 2025 13:53:08 -0700
Subject: [PATCH 1/2] add @model and @huggingface_hub docs

---
 docs/index.md                                 |   1 +
 docs/scaling/checkpoint/huggingface-hub.md    | 165 ++++++++++++++++++
 docs/scaling/checkpoint/introduction.md       |   7 +-
 .../introduction.md                           | 114 ++++++++++++
 sidebars.js                                   |  13 +-
 5 files changed, 298 insertions(+), 2 deletions(-)
 create mode 100644 docs/scaling/checkpoint/huggingface-hub.md
 create mode 100644 docs/scaling/model-lifecycle-management/introduction.md

diff --git a/docs/index.md b/docs/index.md
index b5735f2b..4394261b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -42,6 +42,7 @@ Metaflow makes it easy to build and manage real-life data science, AI, and ML pr
 - [Managing Dependencies](scaling/dependencies) ✨*New support for `uv`*✨
 - [Dealing with Failures](scaling/failures)
 - [Checkpointing Progress](scaling/checkpoint/introduction) ✨*New*✨
+- [Managing Model Lifecycle](scaling/model-lifecycle-management/introduction) ✨*New*✨
 - [Loading and Storing Data](scaling/data)
 - [Organizing Results](scaling/tagging)
 - [Accessing Secrets](scaling/secrets)
diff --git a/docs/scaling/checkpoint/huggingface-hub.md b/docs/scaling/checkpoint/huggingface-hub.md
new file mode 100644
index 00000000..1def9801
--- /dev/null
+++ b/docs/scaling/checkpoint/huggingface-hub.md
@@ -0,0 +1,165 @@
+# HuggingFace Hub Integration
+
+Metaflow includes a wrapper around `@checkpoint` specific to assets in the [HuggingFace Hub](https://huggingface.co/), a popular platform for sharing and discovering machine learning models and datasets. The `@huggingface_hub` decorator simplifies the process of downloading, caching, and managing models from the HuggingFace Hub within your Metaflow workflows.
+
+
+## Understanding `@huggingface_hub`
+
+The `@huggingface_hub` decorator offers two main functionalities:
+
+1. **Loading models directly in steps**: Using the `load` parameter, you can specify models to be downloaded and cached for use within a step.
+2. **Dynamic model downloading**: Through the `current.huggingface_hub.snapshot_download` function, you can download models at runtime and store references for use in subsequent steps.
+
+All models are stored in the same way the `@checkpoint` decorator stores checkpoints, ensuring efficient caching and versioning.
+
+## Using `@huggingface_hub`
+
+### Loading Static Repositories for a Step
+
+For workflows that require specific, unchanging models or datasets from HuggingFace, you can use the `load` parameter of the `@huggingface_hub` decorator. This approach is useful when models don't change often and can be hard-coded into the flow.
+
+The `load` parameter can accept arguments in several formats:
+
+1. A list of strings representing repository IDs:
+
+```python
+@huggingface_hub(load=["bert-base-uncased"])
+@step
+def process_text(self):
+    model_path = current.huggingface_hub.loaded["bert-base-uncased"]
+    # Use the model...
+```
+
+2. A list of tuples specifying the repository ID and a local path:
+
+```python
+@huggingface_hub(load=[("bert-base-uncased", "./model_directory")])
+@step
+def process_text(self):
+    model_path = current.huggingface_hub.loaded["bert-base-uncased"]
+    # Use the model...
+```
+
+3. A list of dictionaries providing detailed configuration for the download:
+
+```python
+@huggingface_hub(load=[
+    {
+        "repo_id": "bert-base-uncased",
+        "allow_patterns": ["*.json", "tokenizer.txt"],
+        "repo_type": "model"
+    }
+])
+@step
+def process_text(self):
+    model_path = current.huggingface_hub.loaded["bert-base-uncased"]
+    # Use the model...
+```
+
+The dictionary format allows you to specify all the parameters supported by HuggingFace's [snapshot_download](https://huggingface.co/docs/huggingface_hub/main/en/package_reference/file_download#huggingface_hub.snapshot_download) function, enabling fine-grained control over what gets downloaded.
+
+### Loading HF Repositories Dynamically
+
+In some workflows, a model or dataset must be determined at runtime, perhaps based on exogenous system events or results produced by upstream workflows. In these cases, you can use the `current.huggingface_hub.snapshot_download` function to dynamically download and cache models.
+
+```python
+@huggingface_hub
+@step
+def start(self):
+    # Download a model from HuggingFace Hub
+    self.hf_model_reference = current.huggingface_hub.snapshot_download(
+        repo_id="bert-base-uncased",
+        allow_patterns=["*.json"]  # Only download specific files
+    )
+    self.next(self.end)
+
+@model(load="hf_model_reference")
+@step
+def end(self):
+    model_path = current.model.loaded["hf_model_reference"]
+    # Use the model...
+```
+
+The `snapshot_download` function returns a reference to the downloaded model, which can be stored as an artifact and loaded in subsequent steps using the `@model` decorator. If the `force_download` parameter is set to `True`, the function bypasses the cache and downloads the model again.
+
+## Complete Example
+
+Here's a complete example that demonstrates both static and dynamic loading of HuggingFace models:
+
+```python
+from metaflow import FlowSpec, step, current, huggingface_hub, model, Parameter, pypi
+
+class HuggingFaceModelFlow(FlowSpec):
+    
+    model_name = Parameter(
+        "model-name", 
+        help="Name of the HuggingFace model to use",
+        default="bert-base-uncased"
+    )
+    
+    @pypi(packages={"huggingface-hub": "0.16.4"})
+    @huggingface_hub(load=["google-bert/bert-base-uncased"])
+    @step
+    def start(self):
+        import os
+        
+        # Access the statically loaded model
+        static_model_path = current.huggingface_hub.loaded["google-bert/bert-base-uncased"]
+        print(f"Static model loaded at: {static_model_path}")
+        print(f"Contents: {os.listdir(static_model_path)}")
+        
+        # Dynamically download the model specified by the parameter
+        self.dynamic_model_ref = current.huggingface_hub.snapshot_download(
+            repo_id=self.model_name,
+            allow_patterns=["*.json", "*.txt"]  # Only download specific files
+        )
+        
+        print(f"Dynamic model reference saved with key: {self.dynamic_model_ref['key']}")
+        self.next(self.end)
+    
+    @model(load="dynamic_model_ref")
+    @step
+    def end(self):
+        import os
+        
+        # Access the dynamically loaded model
+        model_path = current.model.loaded["dynamic_model_ref"]
+        print(f"Dynamic model loaded at: {model_path}")
+        print(f"Contents: {os.listdir(model_path)}")
+
+if __name__ == "__main__":
+    HuggingFaceModelFlow()
+```
+
+You can run this flow with:
+
+```bash
+python flow.py --environment=pypi run --model-name roberta-base
+```
+
+## Best Practices
+
+1. **Selective Downloads**: Use the `allow_patterns` parameter to download only the files you need, especially for large models. This can significantly reduce download times and storage requirements.
+
+2. **Caching Strategy**: Leverage the caching mechanism by default, but use `force_download=True` when you need to ensure you have the latest version of a model.
+
+3. **Error Handling**: Be prepared to handle network issues or API rate limits when downloading models from HuggingFace Hub. We have observed occasional timeouts. Consider implementing retry logic for robustness.
+
+4. **Version Pinning**: When using models in production workflows, consider pinning to specific model versions using revision tags to ensure reproducibility.
+
+5. **Authentication**: For accessing private or gated models, you'll need to set up authentication. You can use the HuggingFace CLI to log in or set the `HF_TOKEN` environment variable. On Outerbounds, you can do this by going to the `Integrations` tab and registering your desired HuggingFace token.
+
+## Integration with Metaflow's Model Management
+
+The `@huggingface_hub` decorator integrates seamlessly with Metaflow's model management system. Models downloaded from HuggingFace Hub are stored in Metaflow's datastore, making them available for versioning, tracking, and sharing across your organization.
+
+When combined with the `@model` decorator, you can create powerful workflows that leverage pre-trained models from HuggingFace Hub, fine-tune them for your specific use case, and manage the entire model lifecycle within Metaflow.
+
+This integration enables efficient workflows where:
+
+1. Models are downloaded once and cached for future use
+2. Large models can be shared across multiple flows without duplicating storage
+3. Model versions are tracked and managed alongside your code
+4. Models can be easily deployed to production environments
+
+By leveraging the `@huggingface_hub` decorator, you can streamline your machine learning workflows and focus on building and deploying models rather than managing infrastructure and dependencies. Continue to the [`@model` documentation](../model-lifecycle-management/introduction) for more details.
diff --git a/docs/scaling/checkpoint/introduction.md b/docs/scaling/checkpoint/introduction.md
index 83295ec3..cb534a43 100644
--- a/docs/scaling/checkpoint/introduction.md
+++ b/docs/scaling/checkpoint/introduction.md
@@ -28,7 +28,12 @@ guaranteed to stay backwards compatible. Please share your feedback on
 
 ## Installing `@checkpoint`
 
-To use the `@checkpoint` extension, install it with
+If you use Metaflow through Outerbounds, this functionality is installed by default when you run:
+```
+pip install outerbounds
+```
+
+To use the `@checkpoint` extension with a self-managed Metaflow deployment, install it with
 ```
 pip install metaflow-checkpoint
 ```
diff --git a/docs/scaling/model-lifecycle-management/introduction.md b/docs/scaling/model-lifecycle-management/introduction.md
new file mode 100644
index 00000000..676acab8
--- /dev/null
+++ b/docs/scaling/model-lifecycle-management/introduction.md
@@ -0,0 +1,114 @@
+# Managing Model Lifecycle
+
+Metaflow's model management features are built on the same foundation as [checkpointing](../checkpoint/introduction.md).
+While checkpointing is primarily about managing state within Metaflow tasks, 
+model lifecycle management presents several orthogonal operational challenges:
+
+1. **Version Control**: Tracking different versions of models as they evolve
+2. **Metadata Management**: Attaching important information about models (accuracy, training parameters, etc.)
+3. **Discoverability**: Finding and loading the right model for inference or further training
+4. **Portability**: Moving models between different environments and workflows
+
+The `@model` decorator addresses these challenges by providing a standardized way to save, load, and manage models 
+within Metaflow. Like the `@checkpoint` functionality described [in the previous section](../checkpoint/introduction.md), `@model` integrates seamlessly with Metaflow's datastore, making it easy to version, track, and share 
+models across your organization in an efficient manner. 
+
+:::info
+The `@model` decorator is part of the `metaflow-checkpoint` extension, 
+so developers who self-manage Metaflow and its plugins have to install it separately as
+described in [installing `@checkpoint`](/scaling/checkpoint/introduction#installing-checkpoint).
+Also, the APIs may change in the future, in contrast to the APIs of core Metaflow which are
+guaranteed to stay backwards compatible. Please share your feedback on
+[Metaflow Slack](http://slack.outerbounds.co)!
+:::
+
+## Using `@model`
+
+The `@model` decorator provides a simple interface for saving and loading models within your Metaflow steps. It's 
+particularly useful for machine learning workflows where you need to train, evaluate, and deploy models.
+
+Let's demonstrate the functionality with this simple flow that trains an XGBoost model and saves it using the `@model` 
+decorator:
+
+```python
+from metaflow import FlowSpec, step, model, current, pypi_base
+
+@pypi_base(python="3.12", packages={"xgboost": "3.0.2"})
+class XGBoostModelFlow(FlowSpec):
+
+    @model
+    @step
+    def start(self):
+        import numpy as np # pylint: disable=import-error
+        import xgboost as xgb  # pylint: disable=import-error
+
+        # Create a simple dataset
+        X = np.random.rand(100, 10)
+        y = np.random.randint(2, size=100)
+        dtrain = xgb.DMatrix(X, label=y)
+        
+        # Train a model
+        param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
+        num_round = 2
+        bst = xgb.train(param, dtrain, num_round)
+        
+        # Save the model to a file
+        bst.save_model("model.bst")
+        
+        # Save the model to Metaflow's datastore
+        self.xgboost_model = current.model.save(
+            "model.bst",
+            label="xgboost_classifier",
+            metadata={
+                "accuracy": 0.95,
+                "training_params": param,
+                "num_rounds": num_round
+            }
+        )
+        self.next(self.end)
+
+    @model(load="xgboost_model") # using the Metaflow artifact key.
+    @step
+    def end(self):
+        import os
+        import xgboost as xgb  # pylint: disable=import-error
+        
+        # Access the loaded model in another step
+        # In both local and remote mode, the model is now cached for future use downstream.
+        # In remote mode, the model state is also passed seamlessly across task/pod boundaries.
+        model_path = os.path.join(current.model.loaded["xgboost_model"], "model.bst")
+        bst = xgb.Booster()
+        bst.load_model(model_path)
+        
+        # Use reloaded model
+        import numpy as np  # pylint: disable=import-error
+        prediction = bst.predict(xgb.DMatrix([np.random.rand(10)]))
+        print(f"Prediction: {prediction}")
+
+if __name__ == "__main__":
+    XGBoostModelFlow()
+```
+
+You can run the flow as usual:
+```
+python flow.py --environment=pypi run
+```
+
+The flow demonstrates typical usage of `@model`:
+
+- `@model` activates the `current.model` object. Here we use it in the `start` step to upload our model after training completes. The resulting `self.xgboost_model` artifact contains a pointer to the model state, cached in Metaflow's datastore.
+
+- `current.model.save()` saves a model file or directory to Metaflow's datastore, along with optional metadata and a label. It returns a reference to the saved model that can be stored as an artifact.
+
+- `@model(load="xgboost_model")` loads the model referenced by the `xgboost_model` artifact created in the previous step. The loaded model is accessible via `current.model.loaded["xgboost_model"]`.
+
+- The model's metadata is accessible via `current.model.loaded.info["xgboost_model"]`.
+
+## Observing `@model` through cards
+
+Try running the flow above. It produces a [Metaflow `@card`](/metaflow/visualizing-results/effortless-task-inspection-with-default-cards) detailing your model's state, which you can view for the `start` step with:
+```
+python flow.py --environment=pypi card view start
+```
+
+If a step is decorated with `@model`, information about the models it saves and loads, including metadata and lineage information, is added to the step's card.
\ No newline at end of file
diff --git a/sidebars.js b/sidebars.js
index 906127dc..3813187d 100644
--- a/sidebars.js
+++ b/sidebars.js
@@ -181,7 +181,18 @@ const sidebars = {
           items: [
             "scaling/checkpoint/checkpoint-ml-libraries",
             "scaling/checkpoint/selecting-checkpoints",
-            
+            "scaling/checkpoint/huggingface-hub",
+          ],
+        },
+        {
+          type: "category",
+          label: "Managing Model Lifecycle",
+          link: {
+            type: "doc",
+            id: "scaling/model-lifecycle-management/introduction",
+          },
+          items: [
+        
           ],
         },
         "scaling/data",

From 054cb5c12c60b08fe9b27a93864ba85ed8342c13 Mon Sep 17 00:00:00 2001
From: Eddie Mattia <eddie@outerbounds.com>
Date: Fri, 6 Jun 2025 14:28:08 -0700
Subject: [PATCH 2/2] align @huggingface_hub install section

---
 docs/scaling/checkpoint/huggingface-hub.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/scaling/checkpoint/huggingface-hub.md b/docs/scaling/checkpoint/huggingface-hub.md
index 1def9801..d92628f1 100644
--- a/docs/scaling/checkpoint/huggingface-hub.md
+++ b/docs/scaling/checkpoint/huggingface-hub.md
@@ -2,6 +2,14 @@
 
 Metaflow includes a wrapper around `@checkpoint` specific to assets in the [HuggingFace Hub](https://huggingface.co/), a popular platform for sharing and discovering machine learning models and datasets. The `@huggingface_hub` decorator simplifies the process of downloading, caching, and managing models from the HuggingFace Hub within your Metaflow workflows.
 
+:::info
+The `@huggingface_hub` decorator is part of the `metaflow-checkpoint` extension, 
+so developers who self-manage Metaflow and its plugins have to install it separately as
+described in [installing `@checkpoint`](/scaling/checkpoint/introduction#installing-checkpoint).
+Also, the APIs may change in the future, in contrast to the APIs of core Metaflow which are
+guaranteed to stay backwards compatible. Please share your feedback on
+[Metaflow Slack](http://slack.outerbounds.co)!
+:::
 
 ## Understanding `@huggingface_hub`