Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 48 additions & 24 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@ wildcard_constraints:
# without declaration!
_config.init_global(config)

def without_keys(d: dict, keys: list) -> dict:
    """Return a copy of ``d`` with every key listed in ``keys`` removed.

    Every entry of ``keys`` must already be a key of ``d``; otherwise the
    call fails instead of silently ignoring the unknown key.

    Args:
        d: Dictionary to copy from. It is not modified.
        keys: Keys to leave out of the returned dictionary.

    Returns:
        A new dict with the items of ``d`` whose key is not in ``keys``.

    Raises:
        RuntimeError: If at least one entry of ``keys`` is not a key of ``d``.
    """
    # Build the set once: it serves both the presence check and the filter,
    # and gives constant-time membership tests in the comprehension.
    excluded = set(keys)
    if not excluded.issubset(d.keys()):
        raise RuntimeError(f"Keys {keys} not fully present in {list(d.keys())}!")
    return {k: v for k, v in d.items() if k not in excluded}

out_dir = _config.config.out_dir
algorithm_params = _config.config.algorithm_params
algorithm_directed = _config.config.algorithm_directed
pca_params = _config.config.pca_params
hac_params = _config.config.hac_params
container_settings = _config.config.container_settings
include_aggregate_algo_eval = _config.config.analysis_include_evaluation_aggregate_algo
pca_params = without_keys(vars(_config.config.analysis.pca), ["pca_chosen", "include", "aggregate_per_algorithm"])
hac_params = without_keys(vars(_config.config.analysis.hac), ["include", "aggregate_per_algorithm"])

# Return the dataset or gold_standard dictionary from the config file given the label
def get_dataset(_datasets, label):
Expand Down Expand Up @@ -71,55 +74,76 @@ def write_dataset_log(dataset, logfile):
def make_final_input(wildcards):
final_input = []

if _config.config.analysis_include_summary:
if _config.config.analysis.summary.include:
# add summary output file for each pathway
# TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128
# final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
# add table summarizing all pathways for each dataset
final_input.extend(expand('{out_dir}{sep}{dataset}-pathway-summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels))

if _config.config.analysis_include_cytoscape:
if _config.config.analysis.cytoscape.include:
final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels))

if _config.config.analysis_include_ml:
if _config.config.analysis.pca.include:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_ml_aggregate_algo:
if _config.config.analysis.pca.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))

if _config.config.analysis.hac.include:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis.hac.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))

if _config.config.analysis.ensemble.include:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis.ensemble.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

if _config.config.analysis.jaccard.include:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis.jaccard.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

if _config.config.analysis_include_evaluation:
if _config.config.analysis.evaluation.include:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))

# dummy file
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs))
if _config.config.analysis_include_evaluation_aggregate_algo:

if _config.config.analysis.evaluation.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms))

if _config.config.analysis.pca.pca_chosen.include:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))

if _config.config.analysis.pca.pca_chosen.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))

if _config.config.analysis.ensemble.evaluation.include:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))

if _config.config.analysis.ensemble.evaluation.aggregate_per_algorithm:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs))

Expand Down Expand Up @@ -463,7 +487,7 @@ rule evaluation_per_algo_pr_per_pathways:
run:
node_table = Evaluation.from_file(input.node_gold_standard_file).node_table
pr_df = Evaluation.node_precision_and_recall(input.pathways, node_table)
Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, include_aggregate_algo_eval)
Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, _config.config.analysis.evaluation.aggregate_per_algorithm)

# Return pathway summary file per dataset
def collect_summary_statistics_per_dataset(wildcards):
Expand Down Expand Up @@ -511,7 +535,7 @@ rule evaluation_per_algo_pca_chosen:
node_table = Evaluation.from_file(input.node_gold_standard_file).node_table
pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir)
pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathways, node_table)
Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, include_aggregate_algo_eval)
Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, _config.config.analysis.pca.pca_chosen.aggregate_per_algorithm)

# Return the dataset pickle file for a specific dataset
def get_dataset_pickle_file(wildcards):
Expand Down Expand Up @@ -554,7 +578,7 @@ rule evaluation_per_algo_ensemble_pr_curve:
run:
node_table = Evaluation.from_file(input.node_gold_standard_file).node_table
node_ensembles_dict = Evaluation.edge_frequency_node_ensemble(node_table, input.ensemble_files, input.dataset_file)
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, include_aggregate_algo_eval)
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, _config.config.analysis.evaluation.aggregate_per_algorithm)

rule evaluation_edge_dummy:
input:
Expand Down
47 changes: 32 additions & 15 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -205,33 +205,50 @@ analysis:
# Create Cytoscape session file with all pathway graphs for each dataset
cytoscape:
include: true
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
# The following analysis options also have an `aggregate_per_algorithm` option,
# which adds the respective analysis to an algorithm as a whole.
# This will only run if the adjacent `include` is true.

# Principal component analysis of the pathway output files
pca:
include: true
# adds ml analysis per algorithm output
# only runs for algorithms with multiple parameter combinations chosen
aggregate_per_algorithm: true
pca_chosen:
include: true
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
labels: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
# controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots.
# the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file.
# KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used
# to pick the 'best' parameter combination.
kde: true
# removes empty pathways from consideration in ml analysis (pca only)
# removes empty pathways from consideration in ml analysis
remove_empty_pathways: false
# Hierarchical agglomerative clustering analysis of the pathway output files
hac:
include: true
aggregate_per_algorithm: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
# Ensembling pathway output
ensemble:
include: true
aggregate_per_algorithm: true
evaluation:
include: true
aggregate_per_algorithm: true
# Jaccard pathway output
jaccard:
include: true
evaluation:
# evaluation per dataset-goldstandard pair
# evaluation will not run unless ml include is set to true
# evaluation per dataset-goldstandard pair.
# This evaluation specifically generates precision-recall curves.
# To run evaluation on top of the other options, see the respective `evaluation` blocks under the other analyses.
include: true
# adds evaluation per algorithm per dataset-goldstandard pair
# evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are set to true
aggregate_per_algorithm: true
25 changes: 21 additions & 4 deletions config/egfr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,33 @@ reconstruction_settings:
locations:
reconstruction_dir: output/egfr
analysis:
cytoscape:
include: true
summary:
include: true
ml:
cytoscape:
include: true
pca:
include: true
aggregate_per_algorithm: true
pca_chosen:
include: true
aggregate_per_algorithm: true
components: 2
labels: true
kde: true
remove_empty_pathways: true
evaluation:
hac:
include: true
aggregate_per_algorithm: true
linkage: 'ward'
metric: 'euclidean'
ensemble:
include: true
aggregate_per_algorithm: true
evaluation:
include: true
aggregate_per_algorithm: true
jaccard:
include: false
evaluation:
include: false
aggregate_per_algorithm: false
38 changes: 34 additions & 4 deletions docker-wrappers/SPRAS/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,18 +138,48 @@ analysis:
include: true
# Create Cytoscape session file with all pathway graphs for each dataset
cytoscape:
include: false
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
include: true
# The following analysis options also have an `aggregate_per_algorithm` option,
# which adds the respective analysis to an algorithm as a whole.
# This will only run if the adjacent `include` is true.

# Principal component analysis of the pathway output files
pca:
include: true
aggregate_per_algorithm: true
pca_chosen:
include: true
aggregate_per_algorithm: true
# specify how many principal components to calculate
components: 2
# boolean to show the labels on the pca graph
labels: true
# controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots.
# the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file.
# KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used
# to pick the 'best' parameter combination.
kde: true
# removes empty pathways from consideration in ml analysis
remove_empty_pathways: false
# Hierarchical agglomerative clustering analysis of the pathway output files
hac:
include: true
aggregate_per_algorithm: true
# 'ward', 'complete', 'average', 'single'
# if linkage: ward, must use metric: euclidean
linkage: 'ward'
# 'euclidean', 'manhattan', 'cosine'
metric: 'euclidean'
# Ensembling pathway output
ensemble:
include: true
aggregate_per_algorithm: true
evaluation:
include: true
aggregate_per_algorithm: true
evaluation:
include: false
# evaluation per dataset-goldstandard pair.
# This evaluation specifically generates precision-recall curves.
# To run evaluation on top of the other options, see the respective `evaluation` blocks under the other analyses.
include: true
aggregate_per_algorithm: true
7 changes: 7 additions & 0 deletions docs/tutorial/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ When gold standards are provided and evaluation is enabled (``include: true``),
analysis:
evaluation:
include: true
# Evaluation can also be enabled for
# PCA, HAC, and ensembling.
# For example:
jaccard:
include: true
evaluation:
include: true

A gold standard dataset must include the following types of keys and files:

Expand Down
2 changes: 1 addition & 1 deletion docs/tutorial/beginner.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ Analysis
include: true
cytoscape:
include: true
ml:
pca:
include: true


Expand Down
Loading
Loading