Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion spras/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.unpack_singularity = False
# A dictionary to store configured datasets against which SPRAS will be run
self.datasets = None
# A dictionary to store dataset categories with their associated dataset labels
self.dataset_categories = None
# A dictionary to store configured gold standard data against output of SPRAS runs
self.gold_standards = None
# The hash length SPRAS will use to identify parameter combinations.
Expand Down Expand Up @@ -124,12 +126,22 @@ def process_datasets(self, raw_config: RawConfig):
# When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
# Convert to dicts to simplify the yaml logging
self.datasets = {}
self.dataset_categories = {}
for dataset in raw_config.datasets:
label = dataset.label
if label.lower() in [key.lower() for key in self.datasets.keys()]:
if label.casefold() in [key.casefold() for key in self.datasets.keys()]:
raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
self.datasets[label] = dict(dataset)

# Extra check for conflicting categories which we don't store, yet.
category = dataset.category
if category:
if category.casefold() in [key.casefold() for key in self.datasets.keys()]:
raise ValueError(f"Dataset categories can not appear as (case-insensitive) labels, yet category {category} appears as a label.")

category_dataset_labels = self.dataset_categories.setdefault(category, [])
category_dataset_labels.append(dataset.label)

# parse gold standard information
self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}

Expand Down
4 changes: 3 additions & 1 deletion spras/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,14 @@ class Dataset(BaseModel):
# validation & coercion logic before we check it against our own
# requirements
label: Annotated[str, AfterValidator(label_validator("Dataset"))]
category: Optional[str] = None
"The dataset category, for working with dataset collections in the configuration."
node_files: list[str]
edge_files: list[str]
other_files: list[str]
data_dir: str

model_config = ConfigDict(extra='forbid')
model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)

class GoldStandard(BaseModel):
label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
Expand Down
20 changes: 17 additions & 3 deletions spras/dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import pickle as pkl
import warnings
from typing import NotRequired, TypedDict

import pandas as pd

Expand All @@ -11,14 +12,26 @@
Methods and intermediate state for loading data and putting it into pandas tables for use by pathway reconstruction algorithms.
"""

class DatasetDict(TypedDict):
    """
    Typed dictionary describing one dataset entry from the SPRAS configuration,
    as consumed by `Dataset.__init__` / `Dataset.load_files_from_dict`.
    See the pydantic `Dataset` model in spras/config/schema.py for the
    validated counterpart of this structure.
    """
    # Unique identifier for the dataset (uniqueness is case-insensitive).
    label: str
    # Optional grouping key for dataset collections; the key may be absent
    # entirely, hence NotRequired rather than Optional.
    category: NotRequired[str]
    # Node attribute file names; presumably resolved against data_dir — see load_files_from_dict.
    node_files: list[str | os.PathLike]
    # Edge/interactome file names; presumably resolved against data_dir.
    edge_files: list[str | os.PathLike]
    # Any additional files associated with the dataset.
    other_files: list[str | os.PathLike]
    # Directory containing the files listed above.
    data_dir: str | os.PathLike

class Dataset:

NODE_ID = "NODEID"
warning_threshold = 0.05 # Threshold for scarcity of columns to warn user

def __init__(self, dataset_dict):
def __init__(self, dataset_dict: DatasetDict):
self.label = None
self.category = None
self.interactome = None
self.node_table = None
self.node_set = set()
Expand Down Expand Up @@ -47,7 +60,7 @@ def from_file(cls, file_name: str):
with open(file_name, "rb") as f:
return pkl.load(f)

def load_files_from_dict(self, dataset_dict):
def load_files_from_dict(self, dataset_dict: DatasetDict):
"""
Loads data files from dataset_dict, which is one dataset dictionary from the list
in the config file with the fields in the config file.
Expand All @@ -67,6 +80,7 @@ def load_files_from_dict(self, dataset_dict):
"""

self.label = dataset_dict["label"]
self.category = dataset_dict["category"] if "category" in dataset_dict else None

# Get file paths from config
# TODO support multiple edge files
Expand Down Expand Up @@ -117,7 +131,7 @@ def load_files_from_dict(self, dataset_dict):
os.path.join(data_loc, node_file), header=None
)
single_node_table.columns = [self.NODE_ID]
new_col_name = node_file.split(".")[0]
new_col_name = str(node_file).split(".")[0]
single_node_table[new_col_name] = True

# Use only keys from the existing node table so that nodes that are not in the interactome are ignored
Expand Down
2 changes: 2 additions & 0 deletions test/dataset/fixtures/dataless/network.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
1 change: 1 addition & 0 deletions test/dataset/fixtures/dataless/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
NODEID prize active dummy
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions test/dataset/fixtures/empty-headers/network.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-headers/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
NODEID prize active dummy
Empty file.
Empty file.
Empty file.
3 changes: 3 additions & 0 deletions test/dataset/fixtures/empty-network/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
NODEID prize active dummy
A 2 true true
C 5.7 true
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-network/sources.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-network/targets.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
B
Empty file.
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions test/dataset/fixtures/standard/network.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
3 changes: 3 additions & 0 deletions test/dataset/fixtures/standard/node-prizes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
NODEID prize active dummy
A 2 true true
C 5.7 true
1 change: 1 addition & 0 deletions test/dataset/fixtures/standard/sources.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A
1 change: 1 addition & 0 deletions test/dataset/fixtures/standard/targets.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
C
60 changes: 60 additions & 0 deletions test/dataset/test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pathlib import Path

import pandas
import pytest

from spras.dataset import Dataset

FIXTURES_PATH = Path('test', 'dataset', 'fixtures')

class TestDataset:
    """
    Tests for spras.dataset.Dataset construction against the fixture
    directories under test/dataset/fixtures/.

    The first four tests assert that degenerate fixtures (empty files,
    header-only files, empty networks) are rejected with
    pandas.errors.EmptyDataError; the last asserts a well-formed dataset loads.
    """

    def test_not_allow_no_cols(self):
        """Completely empty input files must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty'
            })

    def test_not_allow_no_cols_headers(self):
        """Node files that are empty apart from a header must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-headers',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-headers'
            })

    def test_dataless(self):
        """A header-only node-prizes file with no data rows must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'dataless',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'dataless'
            })

    def test_empty_network(self):
        """An empty interactome file must raise EmptyDataError."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-network',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-network'
            })

    def test_standard(self):
        """A well-formed dataset loads and exposes its two-edge interactome."""
        dataset = Dataset({
            # Fixed: was 'empty', a copy-paste remnant from test_not_allow_no_cols;
            # the label should describe the fixture actually being loaded.
            'label': 'standard',
            'edge_files': ['network.txt'],
            'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'],
            'other_files': [],
            'data_dir': FIXTURES_PATH / 'standard'
        })

        assert len(dataset.get_interactome()) == 2
24 changes: 24 additions & 0 deletions test/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ def get_test_config():
},
"datasets": [{
"label": "alg1",
"category": "category1",
"data_dir": "fake",
"edge_files": [],
"other_files": [],
"node_files": []
}, {
"label": "alg2",
"category": "category2",
"data_dir": "faux",
"edge_files": [],
"other_files": [],
Expand Down Expand Up @@ -220,6 +222,28 @@ def test_correct_dataset_label(self):
test_config["datasets"] = [test_dict]
config.init_global(test_config) # no error should be raised

def test_correct_dataset_category(self):
test_config = get_test_config()
config.init_global(test_config)
assert config.config.dataset_categories
assert len(config.config.dataset_categories["category1"]) == 1
assert len(config.config.dataset_categories["category2"]) == 1

def test_multiple_dataset_category(self):
test_config = get_test_config()
for dataset in test_config["datasets"]:
dataset["category"] = "category1"
config.init_global(test_config)
assert config.config.dataset_categories
assert len(config.config.dataset_categories["category1"]) == 2

def test_bad_dataset_category(self):
test_config = get_test_config()
for dataset in test_config["datasets"]:
dataset["category"] = "alg2"
with pytest.raises(ValueError): # categories can not match dataset labels
config.init_global(test_config)

def test_error_gs_label(self):
test_config = get_test_config()
error_labels = ["test$", "@test'"]
Expand Down
Loading