Skip to content

Commit 1884f81

Browse files
committed
use helper functions in mlflow methods
1 parent 374d455 commit 1884f81

File tree

12 files changed

+270
-206
lines changed

12 files changed

+270
-206
lines changed

src/methods/geneformer_mlflow/config.vsh.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -36,28 +36,13 @@ resources:
3636
- path: /src/utils/read_anndata_partial.py
3737
- path: /src/utils/exit_codes.py
3838
- path: /src/utils/unpack.py
39+
- path: /src/utils/mlflow.py
3940
- path: requirements.txt
4041

4142
engines:
4243
- type: docker
4344
image: openproblems/base_pytorch_nvidia:1
44-
setup:
45-
- type: docker
46-
add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
47-
run: sh /uv-installer.sh && rm /uv-installer.sh
48-
env: PATH="/root/.local/bin/:$PATH"
49-
- type: docker
50-
run: uv venv --python 3.11 /opt/venv
51-
- type: docker
52-
env:
53-
- VIRTUAL_ENV=/opt/venv
54-
- PATH="/opt/venv/bin:$PATH"
55-
add: requirements.txt /requirements.txt
56-
run: uv pip install -r /requirements.txt
57-
- type: docker
58-
run: uv pip install mlflow==3.1.0
59-
- type: docker
60-
run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
45+
__merge__: /src/utils/mlflow_docker_setup.yaml
6146

6247
runners:
6348
- type: executable

src/methods/geneformer_mlflow/script.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
import os
22
import sys
3-
import tarfile
4-
import tempfile
5-
import zipfile
63

74
import anndata as ad
85
import mlflow.pyfunc
9-
import pandas as pd
6+
import numpy as np
107

118
## VIASH START
129
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +17,10 @@
2017
## VIASH END
2118

2219
sys.path.append(meta["resources_dir"])
23-
from exit_codes import exit_non_applicable
24-
from read_anndata_partial import read_anndata
25-
from unpack import unpack_directory
20+
from exit_codes import exit_non_applicable # noqa: E402
21+
from mlflow import embed # noqa: E402
22+
from read_anndata_partial import read_anndata # noqa: E402
23+
from unpack import unpack_directory # noqa: E402
2624

2725
print("====== Geneformer (MLflow model) ======", flush=True)
2826

@@ -45,23 +43,25 @@
4543
model = mlflow.pyfunc.load_model(model_dir)
4644
print(model, flush=True)
4745

48-
print("\n>>> Writing temporary input H5AD file...", flush=True)
49-
input_adata = ad.AnnData(
50-
X=adata.X.copy(),
51-
var=adata.var.filter(items=["feature_id"]).rename(
52-
columns={"feature_id": "ensembl_id"}
53-
),
54-
)
55-
print(input_adata, flush=True)
46+
n_processors = meta.get("cpus") or os.cpu_count()
47+
print(f"Available processors: {n_processors}", flush=True)
48+
5649

57-
h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
58-
print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
59-
input_adata.write(h5ad_file.name)
60-
del input_adata
50+
def process_geneformer_input(input_adata):
    """Add Geneformer-specific fields to input AnnData.

    Modifies ``input_adata`` in place by adding two ``obs`` columns:

    - ``cell_idx``: the integer position of each cell (``0 .. n_obs - 1``).
    - ``n_counts``: the per-cell sum of ``X`` over all features.

    Returns nothing; the AnnData object is updated in place.
    """
    input_adata.obs["cell_idx"] = np.arange(input_adata.n_obs)
    # Summing a SciPy sparse matrix over axis=1 yields a 2-D ``np.matrix`` of
    # shape (n_obs, 1) rather than a vector. Flatten it so that a plain 1-D
    # column is stored whether X is dense or sparse.
    input_adata.obs["n_counts"] = np.asarray(input_adata.X.sum(axis=1)).ravel()
6154

62-
print("\n>>> Running model...", flush=True)
63-
input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
64-
embedding = model.predict(input_df)
55+
56+
print("\n>>> Embedding data...", flush=True)
57+
embedding = embed(
58+
adata,
59+
model,
60+
layers=["counts"],
61+
var={"feature_id": "ensembl_id"},
62+
model_params={"nproc": n_processors},
63+
process_adata=process_geneformer_input,
64+
)
6565

6666
print("\n>>> Storing output...", flush=True)
6767
output = ad.AnnData(
@@ -85,7 +85,5 @@
8585
print("\n>>> Cleaning up temporary files...", flush=True)
8686
if model_temp is not None:
8787
model_temp.cleanup()
88-
h5ad_file.close()
89-
os.unlink(h5ad_file.name)
9088

9189
print("\n>>> Done!", flush=True)

src/methods/scgpt_mlflow/config.vsh.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,28 +33,13 @@ resources:
3333
- path: /src/utils/read_anndata_partial.py
3434
- path: /src/utils/exit_codes.py
3535
- path: /src/utils/unpack.py
36+
- path: /src/utils/mlflow.py
3637
- path: requirements.txt
3738

3839
engines:
3940
- type: docker
4041
image: openproblems/base_pytorch_nvidia:1
41-
setup:
42-
- type: docker
43-
add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
44-
run: sh /uv-installer.sh && rm /uv-installer.sh
45-
env: PATH="/root/.local/bin/:$PATH"
46-
- type: docker
47-
run: uv venv --python 3.11 /opt/venv
48-
- type: docker
49-
env:
50-
- VIRTUAL_ENV=/opt/venv
51-
- PATH="/opt/venv/bin:$PATH"
52-
add: requirements.txt /requirements.txt
53-
run: uv pip install -r /requirements.txt
54-
- type: docker
55-
run: uv pip install mlflow==3.1.0
56-
- type: docker
57-
run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
42+
__merge__: /src/utils/mlflow_docker_setup.yaml
5843

5944
runners:
6045
- type: executable

src/methods/scgpt_mlflow/script.py

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
import os
21
import sys
3-
import tarfile
4-
import tempfile
5-
import zipfile
62

73
import anndata as ad
84
import mlflow.pyfunc
9-
import pandas as pd
105

116
## VIASH START
127
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +15,10 @@
2015
## VIASH END
2116

2217
sys.path.append(meta["resources_dir"])
23-
from exit_codes import exit_non_applicable
24-
from read_anndata_partial import read_anndata
25-
from unpack import unpack_directory
18+
from exit_codes import exit_non_applicable # noqa: E402
19+
from mlflow import embed # noqa: E402
20+
from read_anndata_partial import read_anndata # noqa: E402
21+
from unpack import unpack_directory # noqa: E402
2622

2723
print("====== scGPT (MLflow model) ======", flush=True)
2824

@@ -45,22 +41,14 @@
4541
model = mlflow.pyfunc.load_model(model_dir)
4642
print(model, flush=True)
4743

48-
print("\n>>> Writing temporary input H5AD file...", flush=True)
49-
input_adata = ad.AnnData(
50-
X=adata.X.copy(),
51-
var=adata.var.filter(items=["feature_name"]),
44+
print("\n>>> Embedding data...", flush=True)
45+
embedding = embed(
46+
adata,
47+
model,
48+
layers=["counts"],
49+
var={"feature_name": "feature_name"},
50+
model_params={"gene_col": "feature_name"},
5251
)
53-
print(input_adata, flush=True)
54-
55-
h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
56-
print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
57-
input_adata.write(h5ad_file.name)
58-
del input_adata
59-
60-
print("\n>>> Running model...", flush=True)
61-
input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
62-
input_params = {"gene_col": "feature_name"}
63-
embedding = model.predict(input_df, params=input_params)
6452

6553
print("\n>>> Storing output...", flush=True)
6654
output = ad.AnnData(
@@ -84,7 +72,5 @@
8472
print("\n>>> Cleaning up temporary files...", flush=True)
8573
if model_temp is not None:
8674
model_temp.cleanup()
87-
h5ad_file.close()
88-
os.unlink(h5ad_file.name)
8975

9076
print("\n>>> Done!", flush=True)

src/methods/scvi_mlflow/config.vsh.yaml

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,26 +34,13 @@ resources:
3434
- path: /src/utils/read_anndata_partial.py
3535
- path: /src/utils/exit_codes.py
3636
- path: /src/utils/unpack.py
37+
- path: /src/utils/mlflow.py
3738
- path: requirements.txt
3839

3940
engines:
4041
- type: docker
4142
image: openproblems/base_pytorch_nvidia:1
42-
setup:
43-
- type: docker
44-
add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
45-
run: sh /uv-installer.sh && rm /uv-installer.sh
46-
env: PATH="/root/.local/bin/:$PATH"
47-
- type: docker
48-
run: uv venv --python 3.11 /opt/venv
49-
- type: docker
50-
env:
51-
- VIRTUAL_ENV=/opt/venv
52-
- PATH="/opt/venv/bin:$PATH"
53-
add: requirements.txt /requirements.txt
54-
run: uv pip install -r /requirements.txt && uv pip install mlflow==3.1.0
55-
- type: docker
56-
run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
43+
__merge__: /src/utils/mlflow_docker_setup.yaml
5744

5845
runners:
5946
- type: executable

src/methods/scvi_mlflow/script.py

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
import os
21
import sys
3-
import tarfile
4-
import tempfile
5-
import zipfile
62

73
import anndata as ad
84
import mlflow.pyfunc
9-
import pandas as pd
105

116
## VIASH START
127
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +15,10 @@
2015
## VIASH END
2116

2217
sys.path.append(meta["resources_dir"])
23-
from exit_codes import exit_non_applicable
24-
from read_anndata_partial import read_anndata
25-
from unpack import unpack_directory
18+
from exit_codes import exit_non_applicable # noqa: E402
19+
from mlflow import embed # noqa: E402
20+
from read_anndata_partial import read_anndata # noqa: E402
21+
from unpack import unpack_directory # noqa: E402
2622

2723
print("====== scVI (MLflow model) ======", flush=True)
2824

@@ -49,20 +45,14 @@
4945
model = mlflow.pyfunc.load_model(model_dir, model_config={"organism": organism})
5046
print(model, flush=True)
5147

52-
print("\n>>> Writing temporary input H5AD file...", flush=True)
53-
input_adata = ad.AnnData(X=adata.X.copy())
54-
input_adata.var_names = adata.var["feature_id"].values
55-
input_adata.obs["batch"] = adata.obs["batch"].values
56-
print(input_adata, flush=True)
57-
58-
h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
59-
print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
60-
input_adata.write(h5ad_file.name)
61-
del input_adata
62-
63-
print("\n>>> Running model...", flush=True)
64-
input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
65-
embedding = model.predict(input_df)
48+
print("\n>>> Embedding data...", flush=True)
49+
embedding = embed(
50+
adata,
51+
model,
52+
layers=["counts"],
53+
obs=["batch"],
54+
var={"feature_id": "feature_id"}
55+
)
6656

6757
print("\n>>> Storing output...", flush=True)
6858
output = ad.AnnData(
@@ -86,7 +76,5 @@
8676
print("\n>>> Cleaning up temporary files...", flush=True)
8777
if model_temp is not None:
8878
model_temp.cleanup()
89-
h5ad_file.close()
90-
os.unlink(h5ad_file.name)
9179

9280
print("\n>>> Done!", flush=True)

src/methods/transcriptformer_mlflow/config.vsh.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,28 +38,13 @@ resources:
3838
- path: /src/utils/read_anndata_partial.py
3939
- path: /src/utils/exit_codes.py
4040
- path: /src/utils/unpack.py
41+
- path: /src/utils/mlflow.py
4142
- path: requirements.txt
4243

4344
engines:
4445
- type: docker
4546
image: openproblems/base_pytorch_nvidia:1
46-
setup:
47-
- type: docker
48-
add: https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh
49-
run: sh /uv-installer.sh && rm /uv-installer.sh
50-
env: PATH="/root/.local/bin/:$PATH"
51-
- type: docker
52-
run: uv venv --python 3.11 /opt/venv
53-
- type: docker
54-
env:
55-
- VIRTUAL_ENV=/opt/venv
56-
- PATH="/opt/venv/bin:$PATH"
57-
add: requirements.txt /requirements.txt
58-
run: uv pip install -r /requirements.txt
59-
- type: docker
60-
run: uv pip install mlflow==3.1.0
61-
- type: docker
62-
run: uv pip install git+https://github.com/openproblems-bio/core#subdirectory=packages/python/openproblems
47+
__merge__: /src/utils/mlflow_docker_setup.yaml
6348

6449
runners:
6550
- type: executable

src/methods/transcriptformer_mlflow/script.py

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
import os
21
import sys
3-
import tarfile
4-
import tempfile
5-
import zipfile
62

73
import anndata as ad
84
import mlflow.pyfunc
9-
import pandas as pd
105

116
## VIASH START
127
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -20,9 +15,10 @@
2015
## VIASH END
2116

2217
sys.path.append(meta["resources_dir"])
23-
from exit_codes import exit_non_applicable
24-
from read_anndata_partial import read_anndata
25-
from unpack import unpack_directory
18+
from exit_codes import exit_non_applicable # noqa: E402
19+
from mlflow import embed # noqa: E402
20+
from read_anndata_partial import read_anndata # noqa: E402
21+
from unpack import unpack_directory # noqa: E402
2622

2723
print("====== TranscriptFormer (MLflow model) ======", flush=True)
2824

@@ -45,23 +41,20 @@
4541
model = mlflow.pyfunc.load_model(model_dir)
4642
print(model, flush=True)
4743

48-
print("\n>>> Writing temporary input H5AD file...", flush=True)
49-
input_adata = ad.AnnData(
50-
X=adata.X.copy(),
51-
var=adata.var.filter(items=["feature_id"]).rename(
52-
columns={"feature_id": "ensembl_id"}
53-
),
54-
)
55-
input_adata.obs["assay"] = "unknown" # Avoid error if assay is missing
56-
print(input_adata, flush=True)
57-
h5ad_file = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
58-
print(f"Temporary H5AD file: '{h5ad_file.name}'", flush=True)
59-
input_adata.write(h5ad_file.name)
60-
del input_adata
6144

62-
print("\n>>> Running model...", flush=True)
63-
input_df = pd.DataFrame({"input_uri": [h5ad_file.name]})
64-
embedding = model.predict(input_df)
45+
def process_transcriptformer_input(input_adata):
    """Add TranscriptFormer-specific fields to input AnnData.

    Ensures ``input_adata.obs`` has an ``assay`` column, modifying the object
    in place. When the column is absent it is filled with the placeholder
    value ``"unknown"``; an existing ``assay`` column is left untouched.

    Returns nothing; the AnnData object is updated in place.
    """
    # Only add the placeholder when the column is really missing, so that a
    # real assay annotation passed through with the input is not clobbered.
    if "assay" not in input_adata.obs:
        input_adata.obs["assay"] = "unknown"  # Avoid error if assay is missing
48+
49+
50+
print("\n>>> Embedding data...", flush=True)
51+
embedding = embed(
52+
adata,
53+
model,
54+
layers=["counts"],
55+
var={"feature_id": "ensembl_id"},
56+
process_adata=process_transcriptformer_input,
57+
)
6558

6659
print("\n>>> Storing output...", flush=True)
6760
output = ad.AnnData(
@@ -85,7 +78,5 @@
8578
print("\n>>> Cleaning up temporary files...", flush=True)
8679
if model_temp is not None:
8780
model_temp.cleanup()
88-
h5ad_file.close()
89-
os.unlink(h5ad_file.name)
9081

9182
print("\n>>> Done!", flush=True)

0 commit comments

Comments
 (0)