Skip to content

Commit de16242

Browse files
authored
Sopa dep issues (#93)
* Set baysor config via dict
* Update comseg script for sopa 2.1.5
* Convert baysor force_2d parameter to bool
1 parent 776ac26 commit de16242

File tree

3 files changed

+44
-73
lines changed

3 files changed

+44
-73
lines changed

src/methods_transcript_assignment/baysor/config.vsh.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ arguments:
2121
default: global
2222

2323
- name: --force_2d
24-
type: string
24+
type: boolean
2525
required: false
2626
description: "Ignores z-column in the data if it is provided"
2727
direction: input
28-
default: "false"
28+
default: true
2929

3030
- name: --min_molecules_per_cell
3131
type: integer

src/methods_transcript_assignment/baysor/script.py

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
'coordinate_system': 'global',
2121
'output': './temp/methods/baysor/baysor_assigned_transcripts.zarr',
2222

23-
'force_2d': 'false',
23+
'force_2d': True, #'false',
2424
'min_molecules_per_cell': 50,
2525
'scale': -1.0, #NOTE: For parameter selection see https://github.com/gustaveroussy/sopa/tree/main/workflow/config
2626
'scale_std': "25%",
@@ -37,7 +37,7 @@
3737
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
3838
TMP_DIR.mkdir(parents=True, exist_ok=True)
3939

40-
CONFIG_TOML = TMP_DIR / "config.toml"
40+
#CONFIG_TOML = TMP_DIR / "config.toml"
4141

4242

4343
##############################
@@ -70,7 +70,7 @@
7070
label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()
7171
else:
7272
label_image = sdata_segm["segmentation"].to_numpy()
73-
73+
7474
cell_id_dask_series = dask.dataframe.from_dask_array(
7575
dask.array.from_array(
7676
label_image[y_coords, x_coords], chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
@@ -91,26 +91,43 @@
9191
},
9292
)
9393

94-
# Write config to toml
95-
print('Writing config to toml', flush=True)
96-
toml_str = f"""[data]
97-
x = "x"
98-
y = "y"
99-
z = "z"
100-
gene = "feature_name"
101-
force_2d = {par['force_2d']}
102-
min_molecules_per_cell = {int(par['min_molecules_per_cell'])}
103-
exclude_genes = ""
104-
105-
[segmentation]
106-
scale = {float(par['scale'])}
107-
scale_std = "{par['scale_std']}"
108-
n_clusters = {int(par['n_clusters'])}
109-
prior_segmentation_confidence = {float(par['prior_segmentation_confidence'])}
110-
"""
111-
with open(CONFIG_TOML, "w") as toml_file:
112-
toml_file.write(toml_str)
113-
94+
## Write config to toml #NOTE: led to an error since sopa v2.1.5; instead use a config dict
95+
#print('Writing config to toml', flush=True)
96+
#toml_str = f"""[data]
97+
#x = "x"
98+
#y = "y"
99+
#z = "z"
100+
#gene = "feature_name"
101+
#force_2d = {par['force_2d']}
102+
#min_molecules_per_cell = {int(par['min_molecules_per_cell'])}
103+
#exclude_genes = ""
104+
#
105+
#[segmentation]
106+
#scale = {float(par['scale'])}
107+
#scale_std = "{par['scale_std']}"
108+
#n_clusters = {int(par['n_clusters'])}
109+
#prior_segmentation_confidence = {float(par['prior_segmentation_confidence'])}
110+
#"""
111+
#with open(CONFIG_TOML, "w") as toml_file:
112+
# toml_file.write(toml_str)
113+
114+
config = {
115+
"data": {
116+
"x": "x",
117+
"y": "y",
118+
"z": "z",
119+
"gene": "feature_name",
120+
"force_2d": par['force_2d'],
121+
"min_molecules_per_cell": int(par['min_molecules_per_cell']),
122+
"exclude_genes": "",
123+
},
124+
"segmentation": {
125+
"scale": float(par['scale']),
126+
"scale_std": str(par['scale_std']),
127+
"n_clusters": int(par['n_clusters']),
128+
"prior_segmentation_confidence": float(par['prior_segmentation_confidence']),
129+
},
130+
}
114131

115132

116133
# Make transcript patches
@@ -124,7 +141,7 @@
124141
# (called with sopa -->) subprocess.CalledProcessError: Command 'baysor ...' returned non-zero exit status 139.
125142
# When reproducing the error with `baysor ...` it reports a signal (11.1) Segmentation fault Allocations: 5017730 (Pool: 5013281; Big: 4449); GC: 8
126143
os.environ['JULIA_NUM_THREADS'] = str(n_threads)
127-
sopa.segmentation.baysor(sdata_sopa, config=str(CONFIG_TOML))
144+
sopa.segmentation.baysor(sdata_sopa, config=config) #str(CONFIG_TOML))
128145

129146
# Assign transcripts to cell ids
130147
sopa.spatial.assign_transcript_to_cell(

src/methods_transcript_assignment/comseg/script.py

Lines changed: 1 addition & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -25,52 +25,6 @@
2525
}
2626
## VIASH END
2727

28-
def fixed_count_transcripts_aligned(geo_df, points, value_key):
29-
"""
30-
The same function as sopa.aggregation.transcripts._count_transcripts_aligned.
31-
Minor change: the matrix X is converted to csr_matrix to avoid an error in the comseg call
32-
33-
"""
34-
from scipy.sparse import csr_matrix
35-
from anndata import AnnData
36-
from dask.diagnostics import ProgressBar
37-
from functools import partial
38-
from sopa._settings import settings
39-
import geopandas as gpd
40-
def _add_csr(X_partitions, geo_df, partition, gene_column, gene_names ):
41-
if settings.gene_exclude_pattern is not None:
42-
partition = partition[~partition[gene_column].str.match(settings.gene_exclude_pattern, case=False, na=False)]
43-
44-
points_gdf = gpd.GeoDataFrame(partition, geometry=gpd.points_from_xy(partition["x"], partition["y"]))
45-
joined = geo_df.sjoin(points_gdf)
46-
cells_indices, column_indices = joined.index, joined[gene_column].cat.codes
47-
cells_indices = cells_indices[column_indices >= 0]
48-
column_indices = column_indices[column_indices >= 0]
49-
X_partition = csr_matrix((np.full(len(cells_indices), 1), (cells_indices, column_indices)),
50-
shape=(len(geo_df), len(gene_names)),
51-
)
52-
X_partitions.append(X_partition)
53-
54-
55-
points[value_key] = points[value_key].astype("category").cat.as_known()
56-
gene_names = points[value_key].cat.categories.astype(str)
57-
X = csr_matrix((len(geo_df), len(gene_names)), dtype=int)
58-
adata = AnnData(X=X, var=pd.DataFrame(index=gene_names))
59-
adata.obs_names = geo_df.index.astype(str)
60-
geo_df = geo_df.reset_index()
61-
X_partitions = []
62-
with ProgressBar():
63-
points.map_partitions(
64-
partial(_add_csr, X_partitions, geo_df, gene_column=value_key, gene_names=gene_names),
65-
meta=(),
66-
).compute()
67-
for X_partition in X_partitions:
68-
adata.X += X_partition
69-
if settings.gene_exclude_pattern is not None:
70-
adata = adata[:, ~adata.var_names.str.match(settings.gene_exclude_pattern, case=False, na=False)].copy()
71-
return adata
72-
73-
7428

7529
# Read input files
7630
print('Reading input files', flush=True)
@@ -105,7 +59,7 @@ def _add_csr(X_partitions, geo_df, partition, gene_column, gene_names ):
10559
"gene_column": par["gene_column"],
10660
}
10761

108-
sopa.aggregation.transcripts._count_transcripts_aligned = fixed_count_transcripts_aligned
62+
10963
# sopa.settings.parallelization_backend = 'dask' #TODO: get parallelization running.
11064
sopa.segmentation.comseg(sdata, config)
11165

0 commit comments

Comments
 (0)