diff --git a/spras/dataset.py b/spras/dataset.py index 1346750e3..c2271235c 100644 --- a/spras/dataset.py +++ b/spras/dataset.py @@ -1,6 +1,7 @@ import os import pickle as pkl import warnings +from typing import TypedDict import pandas as pd @@ -11,13 +12,23 @@ Methods and intermediate state for loading data and putting it into pandas tables for use by pathway reconstruction algorithms. """ +class DatasetDict(TypedDict): + """ + Type class containing a collection of information pertaining to creating a Dataset + object. This layout is replicated directly in SPRAS configuration files. + """ + label: str + node_files: list[str | os.PathLike] + edge_files: list[str | os.PathLike] + other_files: list[str | os.PathLike] + data_dir: str | os.PathLike class Dataset: NODE_ID = "NODEID" warning_threshold = 0.05 # Threshold for scarcity of columns to warn user - def __init__(self, dataset_dict): + def __init__(self, dataset_dict: DatasetDict): self.label = None self.interactome = None self.node_table = None @@ -47,7 +58,7 @@ def from_file(cls, file_name: str): with open(file_name, "rb") as f: return pkl.load(f) - def load_files_from_dict(self, dataset_dict): + def load_files_from_dict(self, dataset_dict: DatasetDict): """ Loads data files from dataset_dict, which is one dataset dictionary from the list in the config file with the fields in the config file. 
@@ -110,14 +121,14 @@ def load_files_from_dict(self, dataset_dict): # Load generic node tables self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID]) for node_file in node_data_files: - single_node_table = pd.read_table(os.path.join(data_loc, node_file)) + single_node_table = pd.read_table(os.path.join(data_loc, node_file), index_col=False) # If we have only 1 column, assume this is an indicator variable if len(single_node_table.columns) == 1: single_node_table = pd.read_table( os.path.join(data_loc, node_file), header=None ) single_node_table.columns = [self.NODE_ID] - new_col_name = node_file.split(".")[0] + new_col_name = str(node_file).split(".")[0] single_node_table[new_col_name] = True # Use only keys from the existing node table so that nodes that are not in the interactome are ignored diff --git a/test/dataset/fixtures/dataless/network.txt b/test/dataset/fixtures/dataless/network.txt new file mode 100644 index 000000000..5dd49410b --- /dev/null +++ b/test/dataset/fixtures/dataless/network.txt @@ -0,0 +1,2 @@ +A B 1 U +B C 0.5 U diff --git a/test/dataset/fixtures/dataless/node-prizes.txt b/test/dataset/fixtures/dataless/node-prizes.txt new file mode 100644 index 000000000..26897b5a6 --- /dev/null +++ b/test/dataset/fixtures/dataless/node-prizes.txt @@ -0,0 +1 @@ +NODEID prize active dummy diff --git a/test/dataset/fixtures/dataless/sources.txt b/test/dataset/fixtures/dataless/sources.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/dataless/targets.txt b/test/dataset/fixtures/dataless/targets.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-headers/network.txt b/test/dataset/fixtures/empty-headers/network.txt new file mode 100644 index 000000000..5dd49410b --- /dev/null +++ b/test/dataset/fixtures/empty-headers/network.txt @@ -0,0 +1,2 @@ +A B 1 U +B C 0.5 U diff --git a/test/dataset/fixtures/empty-headers/node-prizes.txt 
b/test/dataset/fixtures/empty-headers/node-prizes.txt new file mode 100644 index 000000000..26897b5a6 --- /dev/null +++ b/test/dataset/fixtures/empty-headers/node-prizes.txt @@ -0,0 +1 @@ +NODEID prize active dummy diff --git a/test/dataset/fixtures/empty-headers/sources.txt b/test/dataset/fixtures/empty-headers/sources.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-headers/targets.txt b/test/dataset/fixtures/empty-headers/targets.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-network/network.txt b/test/dataset/fixtures/empty-network/network.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty-network/node-prizes.txt b/test/dataset/fixtures/empty-network/node-prizes.txt new file mode 100644 index 000000000..d03c30492 --- /dev/null +++ b/test/dataset/fixtures/empty-network/node-prizes.txt @@ -0,0 +1,3 @@ +NODEID prize active dummy +A 2 true true +C 5.7 true diff --git a/test/dataset/fixtures/empty-network/sources.txt b/test/dataset/fixtures/empty-network/sources.txt new file mode 100644 index 000000000..8c7e5a667 --- /dev/null +++ b/test/dataset/fixtures/empty-network/sources.txt @@ -0,0 +1 @@ +A \ No newline at end of file diff --git a/test/dataset/fixtures/empty-network/targets.txt b/test/dataset/fixtures/empty-network/targets.txt new file mode 100644 index 000000000..7371f47a6 --- /dev/null +++ b/test/dataset/fixtures/empty-network/targets.txt @@ -0,0 +1 @@ +B \ No newline at end of file diff --git a/test/dataset/fixtures/empty/network.txt b/test/dataset/fixtures/empty/network.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty/node-prizes.txt b/test/dataset/fixtures/empty/node-prizes.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test/dataset/fixtures/empty/sources.txt b/test/dataset/fixtures/empty/sources.txt new file mode 100644 index 000000000..e69de29bb diff --git 
class TestDataset:
    """Checks how ``Dataset`` loads the fixture directories under ``FIXTURES_PATH``."""

    def test_not_allow_no_cols(self):
        """Every file of the ``empty`` fixture is zero bytes, so loading must raise."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty',
            })

    def test_not_allow_no_cols_headers(self):
        """
        ``empty-headers`` has a populated network, a header-only
        ``node-prizes.txt`` and an empty ``sources.txt``; loading must raise.
        """
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-headers',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-headers',
            })

    def test_dataless(self):
        """
        ``dataless`` has a populated network but no node data; loading must raise.

        NOTE(review): the ``dataless`` fixture files are currently byte-identical
        to the ``empty-headers`` ones, so this test duplicates
        ``test_not_allow_no_cols_headers`` -- confirm whether the fixture was
        meant to differ.
        """
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'dataless',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'dataless',
            })

    def test_empty_network(self):
        """Node files are populated but ``network.txt`` is empty; loading must raise."""
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-network',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-network',
            })

    def test_standard(self):
        """A fully populated dataset loads and exposes both edges of its network."""
        dataset = Dataset({
            # Label now matches the fixture directory, like every other test here
            # (it was 'empty', apparently copied from the first test).
            'label': 'standard',
            'edge_files': ['network.txt'],
            'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'],
            'other_files': [],
            'data_dir': FIXTURES_PATH / 'standard',
        })

        assert len(dataset.get_interactome()) == 2

    # "372" refers to pull request #372. For the review comment that motivated
    # this test, see
    # https://github.com/Reed-CompBio/spras/pull/372/files#r2291953612.
    # Note that the input-nodes file has more tabs than the original fixture.
    def test_372(self):
        """Node attributes of ``C`` end up in the columns asserted below."""
        dataset = Dataset({
            'label': 'toy-372',
            'edge_files': ['input-interactome.txt'],
            'node_files': ['input-nodes.txt'],
            'data_dir': FIXTURES_PATH / 'toy-372',
            'other_files': [],
        })

        node_table = dataset.node_table
        assert node_table is not None

        # Look the row for node C up once instead of re-filtering per assertion.
        c_row = node_table[node_table[Dataset.NODE_ID] == 'C'].iloc[0]

        assert c_row['prize'] == 5.7
        # Compared by value, as before: the cell may hold a NumPy/pandas boolean
        # rather than the ``True`` singleton, so ``is True`` could fail.
        assert c_row['active'] == True  # noqa: E712

        assert np.isnan(c_row['sources'])
        assert c_row['targets'] == True  # noqa: E712