Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion spras/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.unpack_singularity = False
# A dictionary to store configured datasets against which SPRAS will be run
self.datasets = None
# A dictionary to store dataset categories with their associated dataset labels
self.dataset_categories = None
# A dictionary to store configured gold standard data against output of SPRAS runs
self.gold_standards = None
# The hash length SPRAS will use to identify parameter combinations.
Expand Down Expand Up @@ -124,12 +126,22 @@ def process_datasets(self, raw_config: RawConfig):
# When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
# Convert to dicts to simplify the yaml logging
self.datasets = {}
self.dataset_categories = {}
for dataset in raw_config.datasets:
label = dataset.label
if label.lower() in [key.lower() for key in self.datasets.keys()]:
if label.casefold() in [key.casefold() for key in self.datasets.keys()]:
raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
self.datasets[label] = dict(dataset)

# Extra check for conflicting categories which we don't store, yet.
category = dataset.category
if category:
if category.casefold() in [key.casefold() for key in self.datasets.keys()]:
raise ValueError(f"Dataset categories can not appear as (case-insensitive) labels, yet category {category} appears as a label.")

category_dataset_labels = self.dataset_categories.setdefault(category, [])
category_dataset_labels.append(dataset.label)

# parse gold standard information
self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}

Expand Down
4 changes: 3 additions & 1 deletion spras/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,14 @@ class Dataset(BaseModel):
# validation & coercion logic before we check it against our own
# requirements
label: Annotated[str, AfterValidator(label_validator("Dataset"))]
category: Optional[str] = None
"The dataset category, for working with dataset collections in the configuration."
node_files: list[str]
edge_files: list[str]
other_files: list[str]
data_dir: str

model_config = ConfigDict(extra='forbid')
model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)

class GoldStandard(BaseModel):
label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
Expand Down
20 changes: 17 additions & 3 deletions spras/dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import pickle as pkl
import warnings
from typing import NotRequired, TypedDict

import pandas as pd

Expand All @@ -11,14 +12,26 @@
Methods and intermediate state for loading data and putting it into pandas tables for use by pathway reconstruction algorithms.
"""

class DatasetDict(TypedDict):
    """
    Typed dictionary describing one dataset entry from the SPRAS configuration,
    as consumed by `Dataset.__init__` / `Dataset.load_files_from_dict`.
    See the pydantic `Dataset` model in spras/config/schema.py for the
    validated counterpart of this structure.
    """
    # Unique identifier for the dataset (uniqueness is case-insensitive).
    label: str
    # Optional grouping key for dataset collections; the key may be absent
    # entirely, hence NotRequired rather than Optional.
    category: NotRequired[str]
    # Node attribute file names; presumably resolved against data_dir — see load_files_from_dict.
    node_files: list[str | os.PathLike]
    # Edge/interactome file names; presumably resolved against data_dir.
    edge_files: list[str | os.PathLike]
    # Any additional files associated with the dataset.
    other_files: list[str | os.PathLike]
    # Directory containing the files listed above.
    data_dir: str | os.PathLike

class Dataset:

NODE_ID = "NODEID"
warning_threshold = 0.05 # Threshold for scarcity of columns to warn user

def __init__(self, dataset_dict):
def __init__(self, dataset_dict: DatasetDict):
self.label = None
self.category = None
self.interactome = None
self.node_table = None
self.node_set = set()
Expand Down Expand Up @@ -47,7 +60,7 @@ def from_file(cls, file_name: str):
with open(file_name, "rb") as f:
return pkl.load(f)

def load_files_from_dict(self, dataset_dict):
def load_files_from_dict(self, dataset_dict: DatasetDict):
"""
Loads data files from dataset_dict, which is one dataset dictionary from the list
in the config file with the fields in the config file.
Expand All @@ -67,6 +80,7 @@ def load_files_from_dict(self, dataset_dict):
"""

self.label = dataset_dict["label"]
self.category = dataset_dict["category"] if "category" in dataset_dict else None

# Get file paths from config
# TODO support multiple edge files
Expand Down Expand Up @@ -117,7 +131,7 @@ def load_files_from_dict(self, dataset_dict):
os.path.join(data_loc, node_file), header=None
)
single_node_table.columns = [self.NODE_ID]
new_col_name = node_file.split(".")[0]
new_col_name = str(node_file).split(".")[0]
single_node_table[new_col_name] = True

# Use only keys from the existing node table so that nodes that are not in the interactome are ignored
Expand Down
2 changes: 2 additions & 0 deletions test/dataset/fixtures/dataless/network.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
1 change: 1 addition & 0 deletions test/dataset/fixtures/dataless/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
NODEID prize active dummy
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions test/dataset/fixtures/empty-headers/network.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-headers/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
NODEID prize active dummy
Empty file.
Empty file.
Empty file.
3 changes: 3 additions & 0 deletions test/dataset/fixtures/empty-network/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
NODEID prize active dummy
A 2 true true
C 5.7 true
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-network/sources.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-network/targets.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
B
Empty file.
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions test/dataset/fixtures/standard/network.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
3 changes: 3 additions & 0 deletions test/dataset/fixtures/standard/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
NODEID prize active dummy
A 2 true true
C 5.7 true
1 change: 1 addition & 0 deletions test/dataset/fixtures/standard/sources.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A
1 change: 1 addition & 0 deletions test/dataset/fixtures/standard/targets.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
60 changes: 60 additions & 0 deletions test/dataset/test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pathlib import Path

import pandas
import pytest

from spras.dataset import Dataset

FIXTURES_PATH = Path('test', 'dataset', 'fixtures')

class TestDataset:
    """
    Tests for spras.dataset.Dataset construction against the fixture
    directories under test/dataset/fixtures/.

    The first four tests assert that degenerate fixtures (empty files,
    header-only files, empty networks) are rejected with
    pandas.errors.EmptyDataError; the last asserts a well-formed dataset loads.
    """

    def test_not_allow_no_cols(self):
        """Completely empty input files must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty'
            })

    def test_not_allow_no_cols_headers(self):
        """Node files that are empty apart from a header must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-headers',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-headers'
            })

    def test_dataless(self):
        """A header-only node-prizes file with no data rows must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'dataless',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'dataless'
            })

    def test_empty_network(self):
        """An empty interactome file must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-network',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-network'
            })

    def test_standard(self):
        """A well-formed dataset loads and exposes its two-edge interactome."""
        dataset = Dataset({
            # Fixed: was 'empty', a copy-paste remnant from test_not_allow_no_cols;
            # the label should describe the fixture actually being loaded.
            'label': 'standard',
            'edge_files': ['network.txt'],
            'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'],
            'other_files': [],
            'data_dir': FIXTURES_PATH / 'standard'
        })

        assert len(dataset.get_interactome()) == 2
24 changes: 24 additions & 0 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ def get_test_config():
},
"datasets": [{
"label": "alg1",
"category": "category1",
"data_dir": "fake",
"edge_files": [],
"other_files": [],
"node_files": []
}, {
"label": "alg2",
"category": "category2",
"data_dir": "faux",
"edge_files": [],
"other_files": [],
Expand Down Expand Up @@ -220,6 +222,28 @@ def test_correct_dataset_label(self):
test_config["datasets"] = [test_dict]
config.init_global(test_config) # no error should be raised

def test_correct_dataset_category(self):
test_config = get_test_config()
config.init_global(test_config)
assert config.config.dataset_categories
assert len(config.config.dataset_categories["category1"]) == 1
assert len(config.config.dataset_categories["category2"]) == 1

def test_multiple_dataset_category(self):
test_config = get_test_config()
for dataset in test_config["datasets"]:
dataset["category"] = "category1"
config.init_global(test_config)
assert config.config.dataset_categories
assert len(config.config.dataset_categories["category1"]) == 2

def test_bad_dataset_category(self):
test_config = get_test_config()
for dataset in test_config["datasets"]:
dataset["category"] = "alg2"
with pytest.raises(ValueError): # categories can not match dataset labels
config.init_global(test_config)

def test_error_gs_label(self):
test_config = get_test_config()
error_labels = ["test$", "@test'"]
Expand Down
Loading