use helper functions in mlflow methods

rcannood · rcannood · commit 1884f811b6a4 · 2025-10-12T08:24:16.000+02:00
diff --git a/src/methods/geneformer_mlflow/config.vsh.yaml b/src/methods/geneformer_mlflow/config.vsh.yaml
@@ -36,28 +36,13 @@ resources:
   - path: /src/utils/read_anndata_partial.py
   - path: /src/utils/exit_codes.py
   - path: /src/utils/unpack.py
+  - path: /src/utils/mlflow.py
   - path: requirements.txt
 
 engines:
   - type: docker
     image: openproblems/base_pytorch_nvidia:1
-    setup:
-      - type: docker
-        add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
-        run: sh /uv-installer.sh && rm /uv-installer.sh
-        env: PATH="/root/.local/bin/:$PATH"
-      - type: docker
-        run: uv venv --python 3.11 /opt/venv
-      - type: docker
-        env:
-          - VIRTUAL_ENV=/opt/venv
-          - PATH="/opt/venv/bin:$PATH"
-        add: requirements.txt /requirements.txt
-        run: uv pip install -r /requirements.txt
-      - type: docker
-        run: uv pip install mlflow==3.1.0
-      - type: docker
-        run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
+    __merge__: /src/utils/mlflow_docker_setup.yaml
 
 runners:
   - type: executable
diff --git a/src/methods/geneformer_mlflow/script.py b/src/methods/geneformer_mlflow/script.py
@@ -1,12 +1,9 @@
 import os
 import sys
-import tarfile
-import tempfile
-import zipfile
 
 import anndata as ad
 import mlflow.pyfunc
-import pandas as pd
+import numpy as np
 
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +17,10 @@
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
-from exit_codes import exit_non_applicable
-from read_anndata_partial import read_anndata
-from unpack import unpack_directory
+from exit_codes import exit_non_applicable  # noqa: E402
+from mlflow import embed  # noqa: E402
+from read_anndata_partial import read_anndata  # noqa: E402
+from unpack import unpack_directory  # noqa: E402
 
 print("====== Geneformer (MLflow model) ======", flush=True)
 
@@ -45,23 +43,25 @@
 model = mlflow.pyfunc.load_model(model_dir)
 print(model, flush=True)
 
-print("\n>>> Writing temporary input H5AD file...", flush=True)
-input_adata = ad.AnnData(
-    X=adata.X.copy(),
-    var=adata.var.filter(items=["feature_id"]).rename(
-        columns={"feature_id": "ensembl_id"}
-    ),
-)
-print(input_adata, flush=True)
+n_processors = meta.get("cpus") or os.cpu_count()
+print(f"Available processors: {n_processors}", flush=True)
+
 
-h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
-print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
-input_adata.write(h5ad_file.name)
-del input_adata
+def process_geneformer_input(input_adata):
+    """Add Geneformer's ``cell_idx`` and 1-D ``n_counts`` obs columns in place."""
+    input_adata.obs["cell_idx"] = np.arange(input_adata.n_obs)
+    input_adata.obs["n_counts"] = np.ravel(input_adata.X.sum(axis=1))
 
-print("\n>>> Running model...", flush=True)
-input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
-embedding = model.predict(input_df)
+
+print("\n>>> Embedding data...", flush=True)
+embedding = embed(
+    adata,
+    model,
+    layers=["counts"],
+    var={"feature_id": "ensembl_id"},
+    model_params={"nproc": n_processors},
+    process_adata=process_geneformer_input,
+)
 
 print("\n>>> Storing output...", flush=True)
 output = ad.AnnData(
@@ -85,7 +85,5 @@
 print("\n>>> Cleaning up temporary files...", flush=True)
 if model_temp is not None:
     model_temp.cleanup()
-h5ad_file.close()
-os.unlink(h5ad_file.name)
 
 print("\n>>> Done!", flush=True)
diff --git a/src/methods/scgpt_mlflow/config.vsh.yaml b/src/methods/scgpt_mlflow/config.vsh.yaml
@@ -33,28 +33,13 @@ resources:
   - path: /src/utils/read_anndata_partial.py
   - path: /src/utils/exit_codes.py
   - path: /src/utils/unpack.py
+  - path: /src/utils/mlflow.py
   - path: requirements.txt
 
 engines:
   - type: docker
     image: openproblems/base_pytorch_nvidia:1
-    setup:
-      - type: docker
-        add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
-        run: sh /uv-installer.sh && rm /uv-installer.sh
-        env: PATH="/root/.local/bin/:$PATH"
-      - type: docker
-        run: uv venv --python 3.11 /opt/venv
-      - type: docker
-        env:
-          - VIRTUAL_ENV=/opt/venv
-          - PATH="/opt/venv/bin:$PATH"
-        add: requirements.txt /requirements.txt
-        run: uv pip install -r /requirements.txt
-      - type: docker
-        run: uv pip install mlflow==3.1.0
-      - type: docker
-        run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
+    __merge__: /src/utils/mlflow_docker_setup.yaml
 
 runners:
   - type: executable
diff --git a/src/methods/scgpt_mlflow/script.py b/src/methods/scgpt_mlflow/script.py
@@ -1,12 +1,7 @@
-import os
 import sys
-import tarfile
-import tempfile
-import zipfile
 
 import anndata as ad
 import mlflow.pyfunc
-import pandas as pd
 
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +15,10 @@
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
-from exit_codes import exit_non_applicable
-from read_anndata_partial import read_anndata
-from unpack import unpack_directory
+from exit_codes import exit_non_applicable  # noqa: E402
+from mlflow import embed  # noqa: E402
+from read_anndata_partial import read_anndata  # noqa: E402
+from unpack import unpack_directory  # noqa: E402
 
 print("====== scGPT (MLflow model) ======", flush=True)
 
@@ -45,22 +41,14 @@
 model = mlflow.pyfunc.load_model(model_dir)
 print(model, flush=True)
 
-print("\n>>> Writing temporary input H5AD file...", flush=True)
-input_adata = ad.AnnData(
-    X=adata.X.copy(),
-    var=adata.var.filter(items=["feature_name"]),
+print("\n>>> Embedding data...", flush=True)
+embedding = embed(
+    adata,
+    model,
+    layers=["counts"],
+    var={"feature_name": "feature_name"},
+    model_params={"gene_col": "feature_name"},
 )
-print(input_adata, flush=True)
-
-h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
-print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
-input_adata.write(h5ad_file.name)
-del input_adata
-
-print("\n>>> Running model...", flush=True)
-input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
-input_params = {"gene_col": "feature_name"}
-embedding = model.predict(input_df, params=input_params)
 
 print("\n>>> Storing output...", flush=True)
 output = ad.AnnData(
@@ -84,7 +72,5 @@
 print("\n>>> Cleaning up temporary files...", flush=True)
 if model_temp is not None:
     model_temp.cleanup()
-h5ad_file.close()
-os.unlink(h5ad_file.name)
 
 print("\n>>> Done!", flush=True)
diff --git a/src/methods/scvi_mlflow/config.vsh.yaml b/src/methods/scvi_mlflow/config.vsh.yaml
@@ -34,26 +34,13 @@ resources:
   - path: /src/utils/read_anndata_partial.py
   - path: /src/utils/exit_codes.py
   - path: /src/utils/unpack.py
+  - path: /src/utils/mlflow.py
   - path: requirements.txt
 
 engines:
   - type: docker
     image: openproblems/base_pytorch_nvidia:1
-    setup:
-      - type: docker
-        add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
-        run: sh /uv-installer.sh && rm /uv-installer.sh
-        env: PATH="/root/.local/bin/:$PATH"
-      - type: docker
-        run: uv venv --python 3.11 /opt/venv
-      - type: docker
-        env:
-          - VIRTUAL_ENV=/opt/venv
-          - PATH="/opt/venv/bin:$PATH"
-        add: requirements.txt /requirements.txt
-        run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0
-      - type: docker
-        run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
+    __merge__: /src/utils/mlflow_docker_setup.yaml
 
 runners:
   - type: executable
diff --git a/src/methods/scvi_mlflow/script.py b/src/methods/scvi_mlflow/script.py
@@ -1,12 +1,7 @@
-import os
 import sys
-import tarfile
-import tempfile
-import zipfile
 
 import anndata as ad
 import mlflow.pyfunc
-import pandas as pd
 
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +15,10 @@
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
-from exit_codes import exit_non_applicable
-from read_anndata_partial import read_anndata
-from unpack import unpack_directory
+from exit_codes import exit_non_applicable  # noqa: E402
+from mlflow import embed  # noqa: E402
+from read_anndata_partial import read_anndata  # noqa: E402
+from unpack import unpack_directory  # noqa: E402
 
 print("====== scVI (MLflow model) ======", flush=True)
 
@@ -49,20 +45,14 @@
 model = mlflow.pyfunc.load_model(model_dir, model_config={"organism": organism})
 print(model, flush=True)
 
-print("\n>>> Writing temporary input H5AD file...", flush=True)
-input_adata = ad.AnnData(X=adata.X.copy())
-input_adata.var_names = adata.var["feature_id"].values
-input_adata.obs["batch"] = adata.obs["batch"].values
-print(input_adata, flush=True)
-
-h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
-print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
-input_adata.write(h5ad_file.name)
-del input_adata
-
-print("\n>>> Running model...", flush=True)
-input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
-embedding = model.predict(input_df)
+print("\n>>> Embedding data...", flush=True)
+embedding = embed(
+    adata,
+    model,
+    layers=["counts"],
+    obs=["batch"],
+    var={"feature_id": "feature_id"}
+)
 
 print("\n>>> Storing output...", flush=True)
 output = ad.AnnData(
@@ -86,7 +76,5 @@
 print("\n>>> Cleaning up temporary files...", flush=True)
 if model_temp is not None:
     model_temp.cleanup()
-h5ad_file.close()
-os.unlink(h5ad_file.name)
 
 print("\n>>> Done!", flush=True)
diff --git a/src/methods/transcriptformer_mlflow/config.vsh.yaml b/src/methods/transcriptformer_mlflow/config.vsh.yaml
@@ -38,28 +38,13 @@ resources:
   - path: /src/utils/read_anndata_partial.py
   - path: /src/utils/exit_codes.py
   - path: /src/utils/unpack.py
+  - path: /src/utils/mlflow.py
   - path: requirements.txt
 
 engines:
   - type: docker
     image: openproblems/base_pytorch_nvidia:1
-    setup:
-      - type: docker
-        add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
-        run: sh /uv-installer.sh && rm /uv-installer.sh
-        env: PATH="/root/.local/bin/:$PATH"
-      - type: docker
-        run: uv venv --python 3.11 /opt/venv
-      - type: docker
-        env:
-          - VIRTUAL_ENV=/opt/venv
-          - PATH="/opt/venv/bin:$PATH"
-        add: requirements.txt /requirements.txt
-        run: uv pip install -r /requirements.txt
-      - type: docker
-        run: uv pip install mlflow==3.1.0
-      - type: docker
-        run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
+    __merge__: /src/utils/mlflow_docker_setup.yaml
 
 runners:
   - type: executable
diff --git a/src/methods/transcriptformer_mlflow/script.py b/src/methods/transcriptformer_mlflow/script.py
@@ -1,12 +1,7 @@
-import os
 import sys
-import tarfile
-import tempfile
-import zipfile
 
 import anndata as ad
 import mlflow.pyfunc
-import pandas as pd
 
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +15,10 @@
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
-from exit_codes import exit_non_applicable
-from read_anndata_partial import read_anndata
-from unpack import unpack_directory
+from exit_codes import exit_non_applicable  # noqa: E402
+from mlflow import embed  # noqa: E402
+from read_anndata_partial import read_anndata  # noqa: E402
+from unpack import unpack_directory  # noqa: E402
 
 print("====== TranscriptFormer (MLflow model) ======", flush=True)
 
@@ -45,23 +41,20 @@
 model = mlflow.pyfunc.load_model(model_dir)
 print(model, flush=True)
 
-print("\n>>> Writing temporary input H5AD file...", flush=True)
-input_adata = ad.AnnData(
-    X=adata.X.copy(),
-    var=adata.var.filter(items=["feature_id"]).rename(
-        columns={"feature_id": "ensembl_id"}
-    ),
-)
-input_adata.obs["assay"] = "unknown"  # Avoid error if assay is missing
-print(input_adata, flush=True)
-h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
-print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
-input_adata.write(h5ad_file.name)
-del input_adata
 
-print("\n>>> Running model...", flush=True)
-input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
-embedding = model.predict(input_df)
+def process_transcriptformer_input(input_adata):
+    """Set the ``assay`` obs column to the placeholder "unknown", in place."""
+    input_adata.obs["assay"] = "unknown"  # Avoid error if assay is missing
+
+
+print("\n>>> Embedding data...", flush=True)
+embedding = embed(
+    adata,
+    model,
+    layers=["counts"],
+    var={"feature_id": "ensembl_id"},
+    process_adata=process_transcriptformer_input,
+)
 
 print("\n>>> Storing output...", flush=True)
 output = ad.AnnData(
@@ -85,7 +78,5 @@
 print("\n>>> Cleaning up temporary files...", flush=True)
 if model_temp is not None:
     model_temp.cleanup()
-h5ad_file.close()
-os.unlink(h5ad_file.name)
 
 print("\n>>> Done!", flush=True)
diff --git a/src/methods/uce_mlflow/config.vsh.yaml b/src/methods/uce_mlflow/config.vsh.yaml
diff --git a/src/methods/uce_mlflow/script.py b/src/methods/uce_mlflow/script.py
diff --git a/src/utils/mlflow.py b/src/utils/mlflow.py
diff --git a/src/utils/mlflow_docker_setup.yaml b/src/utils/mlflow_docker_setup.yaml