Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ select = [
"F823", # undefined-local
"F841", # unused-variable
"I", # isort
"W292", # missing-newline-at-end-of-file
"W292", # missing-newline-at-end-of-file
"PD002", # pandas-use-of-inplace-argument
]

[tool.setuptools.packages.find]
Expand Down
16 changes: 8 additions & 8 deletions spras/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def visualize_precision_and_recall_plot(pr_df: pd.DataFrame, output_file: str |
plt.close()

# save dataframe
pr_df.drop(columns=['Algorithm'], inplace=True)
pr_df = pr_df.drop(columns=['Algorithm'])
pr_df.to_csv(output_file, sep='\t', index=False)

@staticmethod
Expand All @@ -248,7 +248,7 @@ def precision_and_recall_per_pathway(pr_df: pd.DataFrame, output_file: str | Pat
"""
if not pr_df.empty:
pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1])
pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True)
pr_df = pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True)

if aggregate_per_algorithm:
# Guaranteed to only have one algorithm in Algorithm column
Expand Down Expand Up @@ -281,7 +281,7 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st

if not pr_df.empty:
pr_df['Algorithm'] = pr_df['Pathway'].apply(lambda p: Path(p).parent.name.split('-')[1])
pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True, inplace=True)
pr_df = pr_df.sort_values(by=['Recall', 'Pathway'], axis=0, ascending=True)

if aggregate_per_algorithm:
title = "PCA-Chosen Pathway Per Algorithm Precision and Recall Plot"
Expand All @@ -305,7 +305,7 @@ def precision_and_recall_pca_chosen_pathway(pr_df: pd.DataFrame, output_file: st
plt.close()

@staticmethod
def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathway_summary_file: str, output_dir: str):
def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathway_summary_file: str | PathLike, output_dir: str | PathLike) -> list[str]:
"""
Identifies the pathway closest to a specified highest kernel density estimated (KDE) peak based on PCA
coordinates
Expand All @@ -323,7 +323,7 @@ def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathwa
"""
# TODO update to add in the pathways for the algorithms that do not provide a pca chosen pathway https://github.com/Reed-CompBio/spras/issues/341

rep_pathways = []
rep_pathways: list[str] = []

for coordinates_file in coordinates_files:
coord_df = pd.read_csv(coordinates_file, delimiter='\t', header=0)
Expand Down Expand Up @@ -356,7 +356,7 @@ def pca_chosen_pathway(coordinates_files: Iterable[Union[str, PathLike]], pathwa
return rep_pathways

@staticmethod
def edge_frequency_node_ensemble(node_table: pd.DataFrame, ensemble_files: Iterable[Union[str, PathLike]], dataset_file: str) -> dict:
def edge_frequency_node_ensemble(node_table: pd.DataFrame, ensemble_files: Iterable[Union[str, PathLike]], dataset_file: str | PathLike) -> dict:
"""
Generates a dictionary of node ensembles using edge frequency data from a list of ensemble files.
A list of ensemble files can contain an aggregated ensemble or algorithm-specific ensembles per dataset
Expand Down Expand Up @@ -387,11 +387,11 @@ def edge_frequency_node_ensemble(node_table: pd.DataFrame, ensemble_files: Itera

if interactome.empty:
raise ValueError(
f"Cannot compute PR curve or generate node ensemble. Input network for dataset \"{dataset_file.split('-')[0]}\" is empty."
f"Cannot compute PR curve or generate node ensemble. Input network for dataset \"{Path(dataset_file).name.split('-')[0]}\" is empty."
)
if node_table.empty:
raise ValueError(
f"Cannot compute PR curve or generate node ensemble. Gold standard associated with dataset \"{dataset_file.split('-')[0]}\" is empty."
f"Cannot compute PR curve or generate node ensemble. Gold standard associated with dataset \"{Path(dataset_file).name.split('-')[0]}\" is empty."
)

# set the initial default frequencies to 0 for all interactome and gold standard nodes
Expand Down
2 changes: 1 addition & 1 deletion spras/meo.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file, params):
# Would need to load the paths output file to rank edges correctly
df = add_rank_column(df)
df = reinsert_direction_col_directed(df)
df.drop(columns=['Type', 'Oriented', 'Weight'], inplace=True)
df = df.drop(columns=['Type', 'Oriented', 'Weight'])
df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
df, has_duplicates = duplicate_edges(df)
if has_duplicates:
Expand Down
2 changes: 1 addition & 1 deletion spras/omicsintegrator1.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file, params):
df.columns = ["Edge1", "InteractionType", "Edge2"]
df = add_rank_column(df)
df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp")
df.drop(columns=['InteractionType'], inplace=True)
df = df.drop(columns=['InteractionType'])
df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
df, has_duplicates = duplicate_edges(df)
if has_duplicates:
Expand Down
6 changes: 6 additions & 0 deletions test/evaluate/expected/expected-pca-coordinates.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
datapoint_labels PC1 PC2
data-test-params-123 0.52704628 0.70710678
data-test-params-456 -1.05409255 -0.0
data-test-params-789 0.52704628 -0.70710678
centroid 0.0 -0.0
kde_peak 0.11419336 -0.00785674
116 changes: 60 additions & 56 deletions test/evaluate/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
from spras.dataset import Dataset
from spras.evaluation import Evaluation

INPUT_DIR = 'test/evaluate/input/'
OUT_DIR = 'test/evaluate/output/'
EXPECT_DIR = 'test/evaluate/expected/'
GS_NODE_TABLE = pd.read_csv(INPUT_DIR + 'gs_node_table.csv', header=0)
SUMMARY_FILE = INPUT_DIR + 'example_summary.txt'
INPUT_DIR = Path('test', 'evaluate', 'input')
OUT_DIR = Path('test', 'evaluate', 'output')
EXPECT_DIR = Path('test', 'evaluate', 'expected')
GS_NODE_TABLE = pd.read_csv(INPUT_DIR / 'gs_node_table.csv', header=0)
SUMMARY_FILE = INPUT_DIR / 'example_summary.txt'


class TestEvaluate:
Expand All @@ -39,154 +39,158 @@ def setup_class(cls):
pickle.dump(dataset, f)

def test_node_precision_recall_per_pathway(self):
file_paths = [INPUT_DIR + 'data-test-params-123/pathway.txt', INPUT_DIR + 'data-test-params-456/pathway.txt', INPUT_DIR + 'data-test-params-789/pathway.txt', INPUT_DIR + 'data-test-params-empty/pathway.txt']
output_file = Path(OUT_DIR + 'pr-per-pathway.txt')
output_png = Path(OUT_DIR + 'pr-per-pathway.png')
file_paths = [INPUT_DIR / 'data-test-params-123/pathway.txt', INPUT_DIR / 'data-test-params-456/pathway.txt', INPUT_DIR / 'data-test-params-789/pathway.txt', INPUT_DIR / 'data-test-params-empty/pathway.txt']
output_file = Path(OUT_DIR, 'pr-per-pathway.txt')
output_png = Path(OUT_DIR, 'pr-per-pathway.png')
output_file.unlink(missing_ok=True)
output_png.unlink(missing_ok=True)

pr_df = Evaluation.node_precision_and_recall(file_paths, GS_NODE_TABLE)
Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png, True)

output = pd.read_csv(output_file, sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway.txt', sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR / 'expected-pr-per-pathway.txt', sep='\t', header=0).round(8)

assert output.equals(expected)
assert output_png.exists()

def test_node_precision_recall_per_pathway_empty(self):

file_paths = [INPUT_DIR + 'data-test-params-empty/pathway.txt']
output_file = Path(OUT_DIR + 'pr-per-pathway-empty.txt')
output_png = Path(OUT_DIR + 'pr-per-pathway-empty.png')
file_paths = [INPUT_DIR / 'data-test-params-empty/pathway.txt']
output_file = Path(OUT_DIR, 'pr-per-pathway-empty.txt')
output_png = Path(OUT_DIR, 'pr-per-pathway-empty.png')
output_file.unlink(missing_ok=True)
output_png.unlink(missing_ok=True)

pr_df = Evaluation.node_precision_and_recall(file_paths, GS_NODE_TABLE)
Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png, True)

output = pd.read_csv(output_file, sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-empty.txt', sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR / 'expected-pr-per-pathway-empty.txt', sep='\t', header=0).round(8)

assert output.equals(expected)
assert output_png.exists()

def test_node_precision_recall_per_pathway_not_provided(self):
output_file = OUT_DIR + 'pr-per-pathway-not-provided.txt'
output_png = OUT_DIR + 'pr-per-pathway-not-provided.png'
output_file = OUT_DIR / 'pr-per-pathway-not-provided.txt'
output_png = OUT_DIR / 'pr-per-pathway-not-provided.png'
file_paths = []

pr_df = Evaluation.node_precision_and_recall(file_paths, GS_NODE_TABLE)
with pytest.raises(ValueError):
Evaluation.precision_and_recall_per_pathway(pr_df, output_file, output_png)

def test_node_precision_recall_pca_chosen_pathway_not_provided(self):
output_file = Path( OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided.txt')
output_file = Path(OUT_DIR, 'pr-per-pathway-pca-chosen-not-provided.txt')
output_file.unlink(missing_ok=True)
output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen-not-provided.png')
output_png = Path(OUT_DIR, 'pr-per-pathway-pca-chosen-not-provided.png')
output_png.unlink(missing_ok=True)
file_paths = []

pr_df = Evaluation.node_precision_and_recall(file_paths, GS_NODE_TABLE)
Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output_file, output_png)

output = pd.read_csv(output_file, sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR + 'expected-pr-pca-chosen-not-provided.txt', sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR / 'expected-pr-pca-chosen-not-provided.txt', sep='\t', header=0).round(8)

assert output.equals(expected)
assert output_png.exists()

def test_node_precision_recall_pca_chosen_pathway(self):
output_file = Path(OUT_DIR + 'pr-per-pathway-pca-chosen.txt')
output_file = Path(OUT_DIR / 'pr-per-pathway-pca-chosen.txt')
output_file.unlink(missing_ok=True)
output_png = Path(OUT_DIR + 'pr-per-pathway-pca-chosen.png')
output_png = Path(OUT_DIR / 'pr-per-pathway-pca-chosen.png')
output_png.unlink(missing_ok=True)
output_coordinates = Path(OUT_DIR + 'pca-coordinates.tsv')
output_coordinates = Path(OUT_DIR / 'pca-coordinates.tsv')
output_coordinates.unlink(missing_ok=True)

file_paths = [INPUT_DIR + 'data-test-params-123/pathway.txt', INPUT_DIR + 'data-test-params-456/pathway.txt',
INPUT_DIR + 'data-test-params-789/pathway.txt', INPUT_DIR + 'data-test-params-empty/pathway.txt']
file_paths = [INPUT_DIR / 'data-test-params-123' / 'pathway.txt', INPUT_DIR / 'data-test-params-456' / 'pathway.txt',
INPUT_DIR / 'data-test-params-789' / 'pathway.txt', INPUT_DIR / 'data-test-params-empty' / 'pathway.txt']

dataframe = ml.summarize_networks(file_paths)
ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', str(output_coordinates), kde=True, remove_empty_pathways=True)
ml.pca(dataframe, OUT_DIR / 'pca.png', OUT_DIR / 'pca-variance.txt', output_coordinates, kde=True, remove_empty_pathways=True)

pathway = Evaluation.pca_chosen_pathway([output_coordinates], SUMMARY_FILE, INPUT_DIR)
pathways = Evaluation.pca_chosen_pathway([output_coordinates], SUMMARY_FILE, INPUT_DIR)
assert len(pathways) == 1, f"There must only be one pathway, but got {len(pathways)} instead! ({pathways})"
pd.testing.assert_frame_equal(
pd.read_csv(output_coordinates, sep='\t', header=0),
pd.read_csv(EXPECT_DIR / 'expected-pca-coordinates.txt', sep='\t', header=0)
)

pr_df = Evaluation.node_precision_and_recall(pathway, GS_NODE_TABLE)
pr_df = Evaluation.node_precision_and_recall(pathways, GS_NODE_TABLE)
Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output_file, output_png, True)


chosen = pd.read_csv(output_file, sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR + 'expected-pr-per-pathway-pca-chosen.txt', sep='\t', header=0).round(8)
expected = pd.read_csv(EXPECT_DIR / 'expected-pr-per-pathway-pca-chosen.txt', sep='\t', header=0).round(8)

assert chosen.equals(expected)
pd.testing.assert_frame_equal(chosen, expected)
assert output_png.exists()

def test_node_ensemble(self):
out_path_file = Path(OUT_DIR + 'node-ensemble.csv')
out_path_file = Path(OUT_DIR, 'node-ensemble.csv')
out_path_file.unlink(missing_ok=True)
ensemble_network = [INPUT_DIR + 'ensemble-network.tsv']
input_network = OUT_DIR + 'data.pickle'
ensemble_network = [INPUT_DIR / 'ensemble-network.tsv']
input_network = OUT_DIR / 'data.pickle'
node_ensemble_dict = Evaluation.edge_frequency_node_ensemble(GS_NODE_TABLE, ensemble_network, input_network)
node_ensemble_dict['ensemble'].to_csv(out_path_file, sep='\t', index=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-node-ensemble.csv', shallow=False)

def test_empty_node_ensemble(self):
out_path_file = Path(OUT_DIR + 'empty-node-ensemble.csv')
out_path_file = Path(OUT_DIR, 'empty-node-ensemble.csv')
out_path_file.unlink(missing_ok=True)
empty_ensemble_network = [INPUT_DIR + 'empty-ensemble-network.tsv']
input_network = OUT_DIR + 'data.pickle'
empty_ensemble_network = [INPUT_DIR / 'empty-ensemble-network.tsv']
input_network = OUT_DIR / 'data.pickle'
node_ensemble_dict = Evaluation.edge_frequency_node_ensemble(GS_NODE_TABLE, empty_ensemble_network,
input_network)
node_ensemble_dict['empty'].to_csv(out_path_file, sep='\t', index=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-empty-node-ensemble.csv', shallow=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-empty-node-ensemble.csv', shallow=False)

def test_multiple_node_ensemble(self):
out_path_file = Path(OUT_DIR + 'node-ensemble.csv')
out_path_file = Path(OUT_DIR / 'node-ensemble.csv')
out_path_file.unlink(missing_ok=True)
out_path_empty_file = Path(OUT_DIR + 'empty-node-ensemble.csv')
out_path_empty_file = Path(OUT_DIR / 'empty-node-ensemble.csv')
out_path_empty_file.unlink(missing_ok=True)
ensemble_networks = [INPUT_DIR + 'ensemble-network.tsv', INPUT_DIR + 'empty-ensemble-network.tsv']
input_network = OUT_DIR + 'data.pickle'
ensemble_networks = [INPUT_DIR / 'ensemble-network.tsv', INPUT_DIR / 'empty-ensemble-network.tsv']
input_network = OUT_DIR / 'data.pickle'
node_ensemble_dict = Evaluation.edge_frequency_node_ensemble(GS_NODE_TABLE, ensemble_networks, input_network)
node_ensemble_dict['ensemble'].to_csv(out_path_file, sep='\t', index=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-node-ensemble.csv', shallow=False)
node_ensemble_dict['empty'].to_csv(out_path_empty_file, sep='\t', index=False)
assert filecmp.cmp(out_path_empty_file, EXPECT_DIR + 'expected-empty-node-ensemble.csv', shallow=False)
assert filecmp.cmp(out_path_empty_file, EXPECT_DIR / 'expected-empty-node-ensemble.csv', shallow=False)

def test_precision_recall_curve_ensemble_nodes(self):
out_path_png = Path(OUT_DIR + 'pr-curve-ensemble-nodes.png')
out_path_png = Path(OUT_DIR, 'pr-curve-ensemble-nodes.png')
out_path_png.unlink(missing_ok=True)
out_path_file = Path(OUT_DIR + 'pr-curve-ensemble-nodes.txt')
out_path_file = Path(OUT_DIR, 'pr-curve-ensemble-nodes.txt')
out_path_file.unlink(missing_ok=True)
ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep='\t', header=0)
ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble.csv', sep='\t', header=0)
node_ensembles_dict = {'ensemble': ensemble_file}
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, GS_NODE_TABLE, out_path_png,
out_path_file)
assert out_path_png.exists()
assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-pr-curve-ensemble-nodes.txt', shallow=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-pr-curve-ensemble-nodes.txt', shallow=False)

def test_precision_recall_curve_ensemble_nodes_empty(self):
out_path_png = Path(OUT_DIR + 'pr-curve-ensemble-nodes-empty.png')
out_path_png = Path(OUT_DIR, 'pr-curve-ensemble-nodes-empty.png')
out_path_png.unlink(missing_ok=True)
out_path_file = Path(OUT_DIR + 'pr-curve-ensemble-nodes-empty.txt')
out_path_file = Path(OUT_DIR, 'pr-curve-ensemble-nodes-empty.txt')
out_path_file.unlink(missing_ok=True)
empty_ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep='\t', header=0)
empty_ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble-empty.csv', sep='\t', header=0)
node_ensembles_dict = {'ensemble': empty_ensemble_file}
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, GS_NODE_TABLE, out_path_png,
out_path_file)
assert out_path_png.exists()
assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-pr-curve-ensemble-nodes-empty.txt', shallow=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-pr-curve-ensemble-nodes-empty.txt', shallow=False)

def test_precision_recall_curve_multiple_ensemble_nodes(self):
out_path_png = Path(OUT_DIR + 'pr-curve-multiple-ensemble-nodes.png')
out_path_png = Path(OUT_DIR, 'pr-curve-multiple-ensemble-nodes.png')
out_path_png.unlink(missing_ok=True)
out_path_file = Path(OUT_DIR + 'pr-curve-multiple-ensemble-nodes.txt')
out_path_file = Path(OUT_DIR, 'pr-curve-multiple-ensemble-nodes.txt')
out_path_file.unlink(missing_ok=True)
ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep='\t', header=0)
empty_ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep='\t', header=0)
ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble.csv', sep='\t', header=0)
empty_ensemble_file = pd.read_csv(INPUT_DIR / 'node-ensemble-empty.csv', sep='\t', header=0)
node_ensembles_dict = {'ensemble1': ensemble_file, 'ensemble2': ensemble_file, 'ensemble3': empty_ensemble_file}
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, GS_NODE_TABLE, out_path_png,
out_path_file, True)
assert out_path_png.exists()
assert filecmp.cmp(out_path_file, EXPECT_DIR + 'expected-pr-curve-multiple-ensemble-nodes.txt', shallow=False)
assert filecmp.cmp(out_path_file, EXPECT_DIR / 'expected-pr-curve-multiple-ensemble-nodes.txt', shallow=False)
7 changes: 0 additions & 7 deletions test/ml/expected/expected-pca-coordinates-kde-negated.tsv

This file was deleted.

Loading
Loading