19 changes: 15 additions & 4 deletions spras/dataset.py
@@ -1,6 +1,7 @@
 import os
 import pickle as pkl
 import warnings
+from typing import TypedDict
 
 import pandas as pd
 
@@ -11,13 +12,23 @@
 Methods and intermediate state for loading data and putting it into pandas tables for use by pathway reconstruction algorithms.
 """
 
+class DatasetDict(TypedDict):
+    """
+    Type class containing a collection of information pertaining to creating a Dataset
+    object. This layout is replicated directly in SPRAS configuration files.
+    """
+    label: str
+    node_files: list[str | os.PathLike]
+    edge_files: list[str | os.PathLike]
+    other_files: list[str | os.PathLike]
+    data_dir: str | os.PathLike
 
 class Dataset:
 
     NODE_ID = "NODEID"
     warning_threshold = 0.05  # Threshold for scarcity of columns to warn user
 
-    def __init__(self, dataset_dict):
+    def __init__(self, dataset_dict: DatasetDict):
         self.label = None
         self.interactome = None
         self.node_table = None
@@ -47,7 +58,7 @@ def from_file(cls, file_name: str):
         with open(file_name, "rb") as f:
             return pkl.load(f)
 
-    def load_files_from_dict(self, dataset_dict):
+    def load_files_from_dict(self, dataset_dict: DatasetDict):
         """
         Loads data files from dataset_dict, which is one dataset dictionary from the list
         in the config file with the fields in the config file.
@@ -110,14 +121,14 @@ def load_files_from_dict(self, dataset_dict):
         # Load generic node tables
         self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID])
         for node_file in node_data_files:
-            single_node_table = pd.read_table(os.path.join(data_loc, node_file))
+            single_node_table = pd.read_table(os.path.join(data_loc, node_file), index_col=False)
             # If we have only 1 column, assume this is an indicator variable
             if len(single_node_table.columns) == 1:
                 single_node_table = pd.read_table(
                     os.path.join(data_loc, node_file), header=None
                 )
                 single_node_table.columns = [self.NODE_ID]
-                new_col_name = node_file.split(".")[0]
+                new_col_name = str(node_file).split(".")[0]
                 single_node_table[new_col_name] = True
 
             # Use only keys from the existing node table so that nodes that are not in the interactome are ignored
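The index_col=False change targets node files whose data rows carry trailing tab characters, so a row has more fields than the header. By default pandas then treats the first column as the index and every value shifts one column to the left; index_col=False is the documented way to keep the alignment for files with delimiters at the end of each line. A minimal sketch of the difference, not part of the PR (the inline table below is a stand-in for a fixture file, with column names borrowed from the node-prizes fixtures):

import io

import pandas as pd

# A node table whose data row ends with an extra tab, mimicking the toy-372 fixture.
raw = "NODEID\tprize\tactive\nC\t5.7\tTrue\t\n"

default = pd.read_table(io.StringIO(raw))
# 'C' becomes the index and the remaining values shift left under the header.
print(default.index.tolist(), default.iloc[0].tolist())

fixed = pd.read_table(io.StringIO(raw), index_col=False)
# With index_col=False the values stay under the columns they belong to.
print(fixed.iloc[0].tolist())

The companion change to str(node_file).split(".")[0] keeps the indicator-column naming working when a node file is supplied as an os.PathLike (now allowed by DatasetDict) rather than a plain string, since Path objects have no split method.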
2 changes: 2 additions & 0 deletions test/dataset/fixtures/dataless/network.txt
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
1 change: 1 addition & 0 deletions test/dataset/fixtures/dataless/node-prizes.txt
@@ -0,0 +1 @@
NODEID prize active dummy
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions test/dataset/fixtures/empty-headers/network.txt
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-headers/node-prizes.txt
@@ -0,0 +1 @@
NODEID prize active dummy
Empty file.
Empty file.
Empty file.
3 changes: 3 additions & 0 deletions test/dataset/fixtures/empty-network/node-prizes.txt
@@ -0,0 +1,3 @@
NODEID prize active dummy
A 2 true true
C 5.7 true
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-network/sources.txt
@@ -0,0 +1 @@
A
1 change: 1 addition & 0 deletions test/dataset/fixtures/empty-network/targets.txt
@@ -0,0 +1 @@
B
Empty file.
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions test/dataset/fixtures/standard/network.txt
@@ -0,0 +1,2 @@
A B 1 U
B C 0.5 U
3 changes: 3 additions & 0 deletions test/dataset/fixtures/standard/node-prizes.txt
@@ -0,0 +1,3 @@
NODEID prize active dummy
A 2 true true
C 5.7 true
1 change: 1 addition & 0 deletions test/dataset/fixtures/standard/sources.txt
@@ -0,0 +1 @@
A
1 change: 1 addition & 0 deletions test/dataset/fixtures/standard/targets.txt
@@ -0,0 +1 @@
C
2 changes: 2 additions & 0 deletions test/dataset/fixtures/toy-372/input-interactome.txt
@@ -0,0 +1,2 @@
C D 0.77 U
N O 0.66 U
3 changes: 3 additions & 0 deletions test/dataset/fixtures/toy-372/input-nodes.txt
@@ -0,0 +1,3 @@
NODEID prize active dummy sources targets
N
C 5.7 True True
82 changes: 82 additions & 0 deletions test/dataset/test_dataset.py
@@ -0,0 +1,82 @@
from pathlib import Path

import pandas
import pytest
import numpy as np

from spras.dataset import Dataset

FIXTURES_PATH = Path('test', 'dataset', 'fixtures')

class TestDataset:
    def test_not_allow_no_cols(self):
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty'
            })

    def test_not_allow_no_cols_headers(self):
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-headers',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-headers'
            })

    def test_dataless(self):
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'dataless',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'dataless'
            })

    def test_empty_network(self):
        with pytest.raises(pandas.errors.EmptyDataError):
            Dataset({
                'label': 'empty-network',
                'edge_files': ['network.txt'],
                'node_files': ['sources.txt', 'node-prizes.txt'],
                'other_files': [],
                'data_dir': FIXTURES_PATH / 'empty-network'
            })

    def test_standard(self):
        dataset = Dataset({
            'label': 'empty',
            'edge_files': ['network.txt'],
            'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'],
            'other_files': [],
            'data_dir': FIXTURES_PATH / 'standard'
        })

        assert len(dataset.get_interactome()) == 2

    # 372 is a PR, but for the relevant comment, see
    # https://github.com/Reed-CompBio/spras/pull/372/files#r2291953612.
    # Note that the input-nodes file has more tabs than the original fixture.
    def test_372(self):
        dataset = Dataset({
            'label': 'toy-372',
            'edge_files': ['input-interactome.txt'],
            'node_files': ['input-nodes.txt'],
            'data_dir': FIXTURES_PATH / 'toy-372',
            'other_files': []
        })

        node_table = dataset.node_table
        assert node_table is not None

        assert node_table[node_table[Dataset.NODE_ID] == 'C'].iloc[0]['prize'] == 5.7
        assert node_table[node_table[Dataset.NODE_ID] == 'C'].iloc[0]['active'] == True

        assert np.isnan(node_table[node_table[Dataset.NODE_ID] == 'C'].iloc[0]['sources'])
        assert node_table[node_table[Dataset.NODE_ID] == 'C'].iloc[0]['targets'] == True
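The dict literals in these tests follow the DatasetDict layout introduced in spras/dataset.py. As a small illustration that is not part of the PR, annotating such a literal lets a type checker (e.g. mypy) validate the keys and value types before Dataset ever reads the files:

from pathlib import Path

from spras.dataset import Dataset, DatasetDict

# Annotating the literal as DatasetDict lets a type checker flag a missing key
# or a wrongly typed value; at runtime it is still a plain dict.
standard: DatasetDict = {
    'label': 'standard',
    'edge_files': ['network.txt'],
    'node_files': ['node-prizes.txt', 'sources.txt', 'targets.txt'],
    'other_files': [],
    'data_dir': Path('test', 'dataset', 'fixtures', 'standard'),
}

dataset = Dataset(standard)
print(len(dataset.get_interactome()))  # 2 for the standard fixture, as test_standard asserts

Because a TypedDict is an ordinary dict at runtime, Dataset receives exactly the same object the tests construct; the annotation costs nothing at execution time.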