diff --git a/spras/config/config.py b/spras/config/config.py index 346682f53..2fce035b2 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -73,6 +73,8 @@ def __init__(self, raw_config: dict[str, Any]): self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None + # A dictionary to store dataset categories with their associated dataset labels + self.dataset_categories = None # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None # The hash length SPRAS will use to identify parameter combinations. @@ -124,12 +126,22 @@ def process_datasets(self, raw_config: RawConfig): # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging self.datasets = {} + self.dataset_categories = {} for dataset in raw_config.datasets: label = dataset.label - if label.lower() in [key.lower() for key in self.datasets.keys()]: + if label.casefold() in [key.casefold() for key in self.datasets.keys()]: raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) + # Extra check for conflicting categories, which we don't store yet.
+ category = dataset.category + if category: + if category.casefold() in [key.casefold() for key in self.datasets.keys()]: + raise ValueError(f"Dataset categories can not appear as (case-insensitive) labels, yet category {category} appears as a label.") + + category_dataset_labels = self.dataset_categories.setdefault(category, []) + category_dataset_labels.append(dataset.label) + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} diff --git a/spras/config/schema.py b/spras/config/schema.py index f99bbe2d7..9f1cea933 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -121,12 +121,14 @@ class Dataset(BaseModel): # validation & coercion logic before we check it against our own # requirements label: Annotated[str, AfterValidator(label_validator("Dataset"))] + category: Optional[str] = None + "The dataset category, for working with dataset collections in the configuration." node_files: list[str] edge_files: list[str] other_files: list[str] data_dir: str - model_config = ConfigDict(extra='forbid') + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class GoldStandard(BaseModel): label: Annotated[str, AfterValidator(label_validator("Gold Standard"))] diff --git a/spras/dataset.py b/spras/dataset.py index 1346750e3..6282d1b47 100644 --- a/spras/dataset.py +++ b/spras/dataset.py @@ -1,6 +1,7 @@ import os import pickle as pkl import warnings +from typing import NotRequired, TypedDict import pandas as pd @@ -11,14 +12,26 @@ Methods and intermediate state for loading data and putting it into pandas tables for use by pathway reconstruction algorithms. """ +class DatasetDict(TypedDict): + """ + Type class containing a collection of information pertaining to creating a Dataset + object. See spras/config/schema.py's `Dataset` class for the pydantic formulation of `DatasetDict`.
+ """ + label: str + category: NotRequired[str] + node_files: list[str | os.PathLike] + edge_files: list[str | os.PathLike] + other_files: list[str | os.PathLike] + data_dir: str | os.PathLike class Dataset: NODE_ID = "NODEID" warning_threshold = 0.05 # Threshold for scarcity of columns to warn user - def __init__(self, dataset_dict): + def __init__(self, dataset_dict: DatasetDict): self.label = None + self.category = None self.interactome = None self.node_table = None self.node_set = set() @@ -47,7 +60,7 @@ def from_file(cls, file_name: str): with open(file_name, "rb") as f: return pkl.load(f) - def load_files_from_dict(self, dataset_dict): + def load_files_from_dict(self, dataset_dict: DatasetDict): """ Loads data files from dataset_dict, which is one dataset dictionary from the list in the config file with the fields in the config file. @@ -67,6 +80,7 @@ def load_files_from_dict(self, dataset_dict): """ self.label = dataset_dict["label"] + self.category = dataset_dict["category"] if "category" in dataset_dict else None # Get file paths from config # TODO support multiple edge files @@ -117,7 +131,7 @@ def load_files_from_dict(self, dataset_dict): os.path.join(data_loc, node_file), header=None ) single_node_table.columns = [self.NODE_ID] - new_col_name = node_file.split(".")[0] + new_col_name = str(node_file).split(".")[0] single_node_table[new_col_name] = True # Use only keys from the existing node table so that nodes that are not in the interactome are ignored diff --git a/test/dataset/fixtures/dataless/network.txt b/test/dataset/fixtures/dataless/network.txt new file mode 100644 index 000000000..5dd49410b --- /dev/null +++ b/test/dataset/fixtures/dataless/network.txt @@ -0,0 +1,2 @@ +A B 1 U +B C 0.5 U diff --git a/test/dataset/fixtures/dataless/node-prizes.txt b/test/dataset/fixtures/dataless/node-prizes.txt new file mode 100644 index 000000000..26897b5a6 --- /dev/null +++ b/test/dataset/fixtures/dataless/node-prizes.txt @@ -0,0 +1 @@ +NODEID prize active 
dummy diff --git a/test/dataset/fixtures/dataless/sources.txt b/test/dataset/fixtures/dataless/sources.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/dataless/targets.txt b/test/dataset/fixtures/dataless/targets.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-headers/network.txt b/test/dataset/fixtures/empty-headers/network.txt new file mode 100644 index 000000000..5dd49410b --- /dev/null +++ b/test/dataset/fixtures/empty-headers/network.txt @@ -0,0 +1,2 @@ +A B 1 U +B C 0.5 U diff --git a/test/dataset/fixtures/empty-headers/node-prizes.txt b/test/dataset/fixtures/empty-headers/node-prizes.txt new file mode 100644 index 000000000..26897b5a6 --- /dev/null +++ b/test/dataset/fixtures/empty-headers/node-prizes.txt @@ -0,0 +1 @@ +NODEID prize active dummy diff --git a/test/dataset/fixtures/empty-headers/sources.txt b/test/dataset/fixtures/empty-headers/sources.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-headers/targets.txt b/test/dataset/fixtures/empty-headers/targets.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-network/network.txt b/test/dataset/fixtures/empty-network/network.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-network/node-prizes.txt b/test/dataset/fixtures/empty-network/node-prizes.txt new file mode 100644 index 000000000..d03c30492 --- /dev/null +++ b/test/dataset/fixtures/empty-network/node-prizes.txt @@ -0,0 +1,3 @@ +NODEID prize active dummy +A 2 true true +C 5.7 true diff --git a/test/dataset/fixtures/empty-network/sources.txt b/test/dataset/fixtures/empty-network/sources.txt new file mode 100644 index 000000000..8c7e5a667 --- /dev/null +++ b/test/dataset/fixtures/empty-network/sources.txt @@ -0,0 +1 @@ +A \ No newline at end of file diff --git a/test/dataset/fixtures/empty-network/targets.txt 
b/test/dataset/fixtures/empty-network/targets.txt new file mode 100644 index 000000000..7371f47a6 --- /dev/null +++ b/test/dataset/fixtures/empty-network/targets.txt @@ -0,0 +1 @@ +B \ No newline at end of file diff --git a/test/dataset/fixtures/empty/network.txt b/test/dataset/fixtures/empty/network.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty/node-prizes.txt b/test/dataset/fixtures/empty/node-prizes.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty/sources.txt b/test/dataset/fixtures/empty/sources.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/standard/network.txt b/test/dataset/fixtures/standard/network.txt new file mode 100644 index 000000000..5dd49410b --- /dev/null +++ b/test/dataset/fixtures/standard/network.txt @@ -0,0 +1,2 @@ +A B 1 U +B C 0.5 U diff --git a/test/dataset/fixtures/standard/node-prizes.txt b/test/dataset/fixtures/standard/node-prizes.txt new file mode 100644 index 000000000..d03c30492 --- /dev/null +++ b/test/dataset/fixtures/standard/node-prizes.txt @@ -0,0 +1,3 @@ +NODEID prize active dummy +A 2 true true +C 5.7 true diff --git a/test/dataset/fixtures/standard/sources.txt b/test/dataset/fixtures/standard/sources.txt new file mode 100644 index 000000000..f70f10e4d --- /dev/null +++ b/test/dataset/fixtures/standard/sources.txt @@ -0,0 +1 @@ +A diff --git a/test/dataset/fixtures/standard/targets.txt b/test/dataset/fixtures/standard/targets.txt new file mode 100644 index 000000000..3cc58df83 --- /dev/null +++ b/test/dataset/fixtures/standard/targets.txt @@ -0,0 +1 @@ +C diff --git a/test/dataset/test_dataset.py b/test/dataset/test_dataset.py new file mode 100644 index 000000000..4cb988632 --- /dev/null +++ b/test/dataset/test_dataset.py @@ -0,0 +1,60 @@ +from pathlib import Path + +import pandas +import pytest + +from spras.dataset import Dataset + +FIXTURES_PATH = Path('test', 'dataset', 'fixtures') + +class 
TestDataset: + def test_not_allow_no_cols(self): + with pytest.raises(pandas.errors.EmptyDataError): + Dataset({ + 'label': 'empty', + 'edge_files': ['network.txt'], + 'node_files': ['sources.txt', 'node-prizes.txt'], + 'other_files': [], + 'data_dir': FIXTURES_PATH / 'empty' + }) + + def test_not_allow_no_cols_headers(self): + with pytest.raises(pandas.errors.EmptyDataError): + Dataset({ + 'label': 'empty-headers', + 'edge_files': ['network.txt'], + 'node_files': ['sources.txt', 'node-prizes.txt'], + 'other_files': [], + 'data_dir': FIXTURES_PATH / 'empty-headers' + }) + + def test_dataless(self): + with pytest.raises(pandas.errors.EmptyDataError): + Dataset({ + 'label': 'dataless', + 'edge_files': ['network.txt'], + 'node_files': ['sources.txt', 'node-prizes.txt'], + 'other_files': [], + 'data_dir': FIXTURES_PATH / 'dataless' + }) + + def test_empty_network(self): + with pytest.raises(pandas.errors.EmptyDataError): + Dataset({ + 'label': 'empty-network', + 'edge_files': ['network.txt'], + 'node_files': ['sources.txt', 'node-prizes.txt'], + 'other_files': [], + 'data_dir': FIXTURES_PATH / 'empty-network' + }) + + def test_standard(self): + dataset = Dataset({ + 'label': 'empty', + 'edge_files': ['network.txt'], + 'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'], + 'other_files': [], + 'data_dir': FIXTURES_PATH / 'standard' + }) + + assert len(dataset.get_interactome()) == 2 diff --git a/test/test_config.py b/test/test_config.py index f5ec454b7..0d89574e6 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -31,12 +31,14 @@ def get_test_config(): }, "datasets": [{ "label": "alg1", + "category": "category1", "data_dir": "fake", "edge_files": [], "other_files": [], "node_files": [] }, { "label": "alg2", + "category": "category2", "data_dir": "faux", "edge_files": [], "other_files": [], @@ -220,6 +222,28 @@ def test_correct_dataset_label(self): test_config["datasets"] = [test_dict] config.init_global(test_config) # no error should be raised + 
def test_correct_dataset_category(self): + test_config = get_test_config() + config.init_global(test_config) + assert config.config.dataset_categories + assert len(config.config.dataset_categories["category1"]) == 1 + assert len(config.config.dataset_categories["category2"]) == 1 + + def test_multiple_dataset_category(self): + test_config = get_test_config() + for dataset in test_config["datasets"]: + dataset["category"] = "category1" + config.init_global(test_config) + assert config.config.dataset_categories + assert len(config.config.dataset_categories["category1"]) == 2 + + def test_bad_dataset_category(self): + test_config = get_test_config() + for dataset in test_config["datasets"]: + dataset["category"] = "alg2" + with pytest.raises(ValueError): # categories can not match dataset labels + config.init_global(test_config) + def test_error_gs_label(self): test_config = get_test_config() error_labels = ["test$", "@test'"]