Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import yaml
from spras.dataset import Dataset
from spras.evaluation import Evaluation
from spras.analysis import ml, summary, cytoscape
from spras.attribution import attribute_algorithms
import spras.config.config as _config

# Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
Expand Down Expand Up @@ -126,6 +127,10 @@ def make_final_input(wildcards):
# Since (formatted) pathway files are interesting to the user, we preserve them.
final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, dataset=dataset_labels, algorithm_params=algorithms_with_params))

if _config.config.analysis_include_attribution:
final_input.extend(expand('{out_dir}{sep}attribution/{algorithm}.bib', out_dir=out_dir, sep=SEP, algorithm=algorithms))
final_input.extend(expand('{out_dir}{sep}attribution/all.bib', out_dir=out_dir, sep=SEP, ))

# Create log files for the parameters and datasets
final_input.extend(expand('{out_dir}{sep}logs{sep}parameters-{algorithm_params}.yaml', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}logs{sep}datasets-{dataset}.yaml', out_dir=out_dir, sep=SEP, dataset=dataset_labels))
Expand Down Expand Up @@ -407,7 +412,7 @@ rule ensemble_per_algo:
# Calculate Jaccard similarity between output pathways for each dataset per algorithm
rule jaccard_similarity_per_algo:
input:
pathways = collect_pathways_per_algo
pathways = collect_pathways_per_algo
output:
jaccard_similarity_matrix = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-jaccard-matrix.txt']),
jaccard_similarity_heatmap = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-jaccard-heatmap.png'])
Expand Down Expand Up @@ -567,6 +572,13 @@ rule evaluation_edge_dummy:
directed_edge_table = Evaluation.from_file(input.edge_gold_standard_file).directed_edge_table
Evaluation.edge_dummy_function(mixed_edge_table, undirected_edge_table, directed_edge_table, output.dummy_file)

# Generate the BibTeX attribution files: one .bib per algorithm plus the aggregated all.bib.
# The rule declares no inputs; it only hands its declared output paths to attribute_algorithms.
# NOTE(review): attribute_algorithms fetches citations over HTTP (see spras/attribution.py),
# so this rule presumably needs network access when it runs - confirm for offline/CI use.
rule attribution:
output:
# Aggregated citations for all algorithms
attribution_all = SEP.join([out_dir, 'attribution', 'all.bib']),
# One citation file per algorithm, named after the algorithm
attribution_algorithms = expand('{out_dir}{sep}attribution{sep}{algorithms}.bib', out_dir=out_dir, sep=SEP, algorithms=algorithms),
run:
attribute_algorithms(output.attribution_all, output.attribution_algorithms)

# Remove the output directory
rule clean:
shell: f'rm -rf {out_dir}'
4 changes: 4 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,7 @@ analysis:
# adds evaluation per algorithm per dataset-goldstandard pair
# evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are set to true
aggregate_per_algorithm: true
attribution:
# Generate BibTeX citations at OUT_DIR/attribution/<algorithm>.bib for every included algorithm,
# plus an aggregated OUT_DIR/attribution/all.bib
include: true
4 changes: 4 additions & 0 deletions docs/contributing/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,10 @@ Modify parse outputs:
``test/parse-outputs/test_parse_outputs.py``, with any parameters it
needs.

Finally, add an empty ``localneighborhood.bib`` file to
``test/attribution``, indicating that LocalNeighborhood has no
publications to cite.

Step 6: Update documentation
----------------------------

Expand Down
39 changes: 39 additions & 0 deletions spras/attribution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import urllib.parse
from pathlib import Path

import requests

from spras.runner import algorithms

DOI_BASE = "https://citation.doi.org/format?style=bibtex&lang=en-US&doi="


def format_request(doi: str) -> str:
    """
    Builds the citation-service URL that asks for the BibTeX entry of a DOI.

    The DOI is percent-encoded with the default settings of
    urllib.parse.quote, which leaves '/' untouched.
    """
    encoded_doi = urllib.parse.quote(doi)
    return f"{DOI_BASE}{encoded_doi}"

def get_bibtex(doi: str, *, timeout: float = 30.0) -> str:
    """
    Fetches the BibTeX entry for a DOI from the citation service.

    doi: the DOI to look up, e.g. 10.1186/1752-0509-3-67
    timeout: seconds to wait for the service before giving up

    Returns the response body with surrounding whitespace stripped.

    Raises requests.HTTPError if the service answers with an error status,
    and requests.Timeout if it does not answer within `timeout` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server, which would stall the whole workflow.
    response = requests.get(format_request(doi), timeout=timeout)
    # Fail loudly instead of letting an error page end up inside a .bib file.
    response.raise_for_status()

    return response.text.strip()

def attribute_algorithms(all_file: str, alg_files: list[str]):
    """
    Attributes all algorithms specified by alg_files, aggregating them in
    all_file.

    all_file: path of the aggregated BibTeX file to write
    alg_files: per-algorithm BibTeX paths to write; the stem of each path
        (its file name without the extension) is used as the key into
        `algorithms` to find that algorithm's DOIs

    Parent directories are created as needed. Every citation is written on
    its own line. Files are written as UTF-8 because the fetched citations
    contain non-ASCII characters (author names, typographic dashes), which
    would otherwise depend on the platform's default encoding.

    Raises KeyError if the stem of a path in alg_files is not a key of
    `algorithms`.
    """
    # Several algorithms may cite the same publication; fetch each DOI once.
    bibtex_by_doi: dict[str, str] = {}

    def fetch_once(doi: str) -> str:
        if doi not in bibtex_by_doi:
            bibtex_by_doi[doi] = get_bibtex(doi)
        return bibtex_by_doi[doi]

    # All citations are fetched before anything is written, so a failed
    # request does not leave a partially written set of files behind.
    algorithm_citations = [
        (file, [fetch_once(doi) for doi in algorithms[Path(file).stem].dois])
        for file in alg_files
    ]

    for alg_output, alg_citations in algorithm_citations:
        Path(alg_output).parent.mkdir(parents=True, exist_ok=True)
        with open(alg_output, 'w', encoding='utf-8') as handle:
            handle.writelines(citation + '\n' for citation in alg_citations)

    Path(all_file).parent.mkdir(parents=True, exist_ok=True)
    with open(all_file, 'w', encoding='utf-8') as handle:
        # NOTE(review): a publication shared by several algorithms is written
        # once per algorithm here, so all_file can contain repeated entries.
        for _, alg_citations in algorithm_citations:
            handle.writelines(citation + '\n' for citation in alg_citations)
3 changes: 3 additions & 0 deletions spras/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ def __init__(self, raw_config: dict[str, Any]):
self.analysis_include_ml = None
# A Boolean specifying whether to run the Evaluation analysis
self.analysis_include_evaluation = None
# A Boolean specifying whether to run the attribution postprocessing
self.analysis_include_attribution = None
# A Boolean specifying whether to run the ML per algorithm analysis
self.analysis_include_ml_aggregate_algo = None
# A Boolean specifying whether to run the evaluation per algorithm analysis
Expand Down Expand Up @@ -249,6 +251,7 @@ def process_analysis(self, raw_config: RawConfig):
self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include
self.analysis_include_ml = raw_config.analysis.ml.include
self.analysis_include_evaluation = raw_config.analysis.evaluation.include
self.analysis_include_attribution = raw_config.analysis.attribution.include

# Only run ML aggregate per algorithm if analysis include ML is set to True
if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml:
Expand Down
6 changes: 6 additions & 0 deletions spras/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,17 @@ class EvaluationAnalysis(BaseModel):

model_config = ConfigDict(extra='forbid')

# Schema for the `attribution` entry of the `analysis` config section.
class Attribution(BaseModel):
# Whether the attribution (citation generation) step is enabled.
include: bool

# Reject any keys other than the fields declared above.
model_config = ConfigDict(extra='forbid')

# Schema for the `analysis` config section.
# Each analysis field has a default constructed with include=False.
class Analysis(BaseModel):
summary: SummaryAnalysis = SummaryAnalysis(include=False)
cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False)
ml: MlAnalysis = MlAnalysis(include=False)
evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False)
attribution: Attribution = Attribution(include=False)

# Reject any keys other than the fields declared above.
model_config = ConfigDict(extra='forbid')

Expand Down
2 changes: 2 additions & 0 deletions test/analysis/input/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,5 @@ analysis:
metric: 'euclidean'
evaluation:
include: false
attribution:
include: false
2 changes: 2 additions & 0 deletions test/analysis/input/egfr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,5 @@ analysis:
include: false
evaluation:
include: false
attribution:
include: false
9 changes: 9 additions & 0 deletions test/attribution/expected/all.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
@article{Supper_Spangenberg_Planatscher_Dräger_Schröder_Zell_2009, title={BowTieBuilder: modeling signal transduction pathways}, volume={3}, url={http://dx.doi.org/10.1186/1752-0509-3-67}, DOI={10.1186/1752-0509-3-67}, number={1}, journal={BMC Systems Biology}, publisher={Springer Science and Business Media LLC}, author={Supper, Jochen and Spangenberg, Lucía and Planatscher, Hannes and Dräger, Andreas and Schröder, Adrian and Zell, Andreas}, year={2009}, month=june, language={en} }
@article{Levi_Elkon_Shamir_2021, title={DOMINO: a network‐based active module identification algorithm with reduced rate of false calls}, volume={17}, url={http://dx.doi.org/10.15252/msb.20209593}, DOI={10.15252/msb.20209593}, number={1}, journal={Molecular Systems Biology}, publisher={Springer Science and Business Media LLC}, author={Levi, Hagai and Elkon, Ran and Shamir, Ron}, year={2021}, month=jan, language={en} }
@article{Gitter_Klein-Seetharaman_Gupta_Bar-Joseph_2010, title={Discovering pathways by orienting edges in protein interaction networks}, volume={39}, url={http://dx.doi.org/10.1093/nar/gkq1207}, DOI={10.1093/nar/gkq1207}, number={4}, journal={Nucleic Acids Research}, publisher={Oxford University Press (OUP)}, author={Gitter, Anthony and Klein-Seetharaman, Judith and Gupta, Anupam and Bar-Joseph, Ziv}, year={2010}, month=nov, pages={e22–e22}, language={en} }
@article{Yeger-Lotem_Riva_Su_Gitler_Cashikar_King_Auluck_Geddie_Valastyan_Karger_et al._2009, title={Bridging high-throughput genetic and transcriptional data reveals cellular responses to alpha-synuclein toxicity}, volume={41}, url={http://dx.doi.org/10.1038/ng.337}, DOI={10.1038/ng.337}, number={3}, journal={Nature Genetics}, publisher={Springer Science and Business Media LLC}, author={Yeger-Lotem, Esti and Riva, Laura and Su, Linhui Julie and Gitler, Aaron D and Cashikar, Anil G and King, Oliver D and Auluck, Pavan K and Geddie, Melissa L and Valastyan, Julie S and Karger, David R and Lindquist, Susan and Fraenkel, Ernest}, year={2009}, month=feb, pages={316–323}, language={en} }
@article{Tuncbag_Gosline_Kedaigle_Soltis_Gitter_Fraenkel_2016, title={Network-Based Interpretation of Diverse High-Throughput Datasets through the Omics Integrator Software Package}, volume={12}, url={http://dx.doi.org/10.1371/journal.pcbi.1004879}, DOI={10.1371/journal.pcbi.1004879}, number={4}, journal={PLOS Computational Biology}, publisher={Public Library of Science (PLoS)}, author={Tuncbag, Nurcan and Gosline, Sara J. C. and Kedaigle, Amanda and Soltis, Anthony R. and Gitter, Anthony and Fraenkel, Ernest}, editor={Prlic, Andreas}, year={2016}, month=apr, pages={e1004879}, language={en} }
@article{Tuncbag_Gosline_Kedaigle_Soltis_Gitter_Fraenkel_2016, title={Network-Based Interpretation of Diverse High-Throughput Datasets through the Omics Integrator Software Package}, volume={12}, url={http://dx.doi.org/10.1371/journal.pcbi.1004879}, DOI={10.1371/journal.pcbi.1004879}, number={4}, journal={PLOS Computational Biology}, publisher={Public Library of Science (PLoS)}, author={Tuncbag, Nurcan and Gosline, Sara J. C. and Kedaigle, Amanda and Soltis, Anthony R. and Gitter, Anthony and Fraenkel, Ernest}, editor={Prlic, Andreas}, year={2016}, month=apr, pages={e1004879}, language={en} }
@article{Ritz_Poirel_Tegge_Sharp_Simmons_Powell_Kale_Murali_2016, title={Pathways on demand: automated reconstruction of human signaling networks}, volume={2}, url={http://dx.doi.org/10.1038/npjsba.2016.2}, DOI={10.1038/npjsba.2016.2}, abstractNote={<jats:title>Abstract</jats:title><jats:p>Signaling pathways are a cornerstone of systems biology. Several databases store high-quality representations of these pathways that are amenable for automated analyses. Despite painstaking and manual curation, these databases remain incomplete. We present P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc>, a new computational method to reconstruct the interactions in a signaling pathway of interest. P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> efficiently computes multiple short paths from the receptors to transcriptional regulators (TRs) in a pathway within a background protein interaction network. We use P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> to accurately reconstruct a comprehensive set of signaling pathways from the NetPath and KEGG databases. We show that P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> has higher precision and recall than several state-of-the-art algorithms, while also ensuring that the resulting network connects receptor proteins to TRs. P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc>’s reconstruction of the Wnt pathway identified CFTR, an ABC class chloride ion channel transporter, as a novel intermediary that facilitates the signaling of Ryk to Dab2, which are known components of Wnt/β-catenin signaling. In HEK293 cells, we show that the Ryk–CFTR–Dab2 path is a novel amplifier of β-catenin signaling specifically in response to Wnt 1, 2, 3, and 3a of the 11 Wnts tested. P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> captures the structure of signaling pathways as represented in pathway databases better than existing methods. 
P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc>’s success in reconstructing pathways from NetPath and KEGG databases point to its applicability for complementing manual curation of these databases. P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> may serve as a promising approach for prioritizing proteins and interactions for experimental study, as illustrated by its discovery of a novel pathway in Wnt/β-catenin signaling. Our supplementary website at <jats:ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="http://bioinformatics.cs.vt.edu/~murali/supplements/2016-sys-bio-applications-pathlinker/">http://bioinformatics.cs.vt.edu/~murali/supplements/2016-sys-bio-applications-pathlinker/</jats:ext-link> provides links to the P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> software, input datasets, P<jats:sc>ATH</jats:sc>L<jats:sc>INKER</jats:sc> reconstructions of NetPath pathways, and links to interactive visualizations of these reconstructions on GraphSpace.</jats:p>}, number={1}, journal={npj Systems Biology and Applications}, publisher={Springer Science and Business Media LLC}, author={Ritz, Anna and Poirel, Christopher L and Tegge, Allison N and Sharp, Nicholas and Simmons, Kelsey and Powell, Allison and Kale, Shiv D and Murali, TM}, year={2016}, month=mar, language={en} }
@article{Poirel_Rodrigues_Chen_Tyson_Murali_2013, title={Top-Down Network Analysis to Drive Bottom-Up Modeling of Physiological Processes}, volume={20}, url={http://dx.doi.org/10.1089/cmb.2012.0274}, DOI={10.1089/cmb.2012.0274}, number={5}, journal={Journal of Computational Biology}, publisher={Mary Ann Liebert Inc}, author={Poirel, Christopher L. and Rodrigues, Richard R. and Chen, Katherine C. and Tyson, John J. and Murali, T.M.}, year={2013}, month=may, pages={409–418}, language={en} }
@article{Yeger-Lotem_Riva_Su_Gitler_Cashikar_King_Auluck_Geddie_Valastyan_Karger_et al._2009, title={Bridging high-throughput genetic and transcriptional data reveals cellular responses to alpha-synuclein toxicity}, volume={41}, url={http://dx.doi.org/10.1038/ng.337}, DOI={10.1038/ng.337}, number={3}, journal={Nature Genetics}, publisher={Springer Science and Business Media LLC}, author={Yeger-Lotem, Esti and Riva, Laura and Su, Linhui Julie and Gitler, Aaron D and Cashikar, Anil G and King, Oliver D and Auluck, Pavan K and Geddie, Melissa L and Valastyan, Julie S and Karger, David R and Lindquist, Susan and Fraenkel, Ernest}, year={2009}, month=feb, pages={316–323}, language={en} }
Empty file.
1 change: 1 addition & 0 deletions test/attribution/expected/bowtiebuilder.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@article{Supper_Spangenberg_Planatscher_Dräger_Schröder_Zell_2009, title={BowTieBuilder: modeling signal transduction pathways}, volume={3}, url={http://dx.doi.org/10.1186/1752-0509-3-67}, DOI={10.1186/1752-0509-3-67}, number={1}, journal={BMC Systems Biology}, publisher={Springer Science and Business Media LLC}, author={Supper, Jochen and Spangenberg, Lucía and Planatscher, Hannes and Dräger, Andreas and Schröder, Adrian and Zell, Andreas}, year={2009}, month=june, language={en} }
1 change: 1 addition & 0 deletions test/attribution/expected/domino.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@article{Levi_Elkon_Shamir_2021, title={DOMINO: a network‐based active module identification algorithm with reduced rate of false calls}, volume={17}, url={http://dx.doi.org/10.15252/msb.20209593}, DOI={10.15252/msb.20209593}, number={1}, journal={Molecular Systems Biology}, publisher={Springer Science and Business Media LLC}, author={Levi, Hagai and Elkon, Ran and Shamir, Ron}, year={2021}, month=jan, language={en} }
1 change: 1 addition & 0 deletions test/attribution/expected/meo.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@article{Gitter_Klein-Seetharaman_Gupta_Bar-Joseph_2010, title={Discovering pathways by orienting edges in protein interaction networks}, volume={39}, url={http://dx.doi.org/10.1093/nar/gkq1207}, DOI={10.1093/nar/gkq1207}, number={4}, journal={Nucleic Acids Research}, publisher={Oxford University Press (OUP)}, author={Gitter, Anthony and Klein-Seetharaman, Judith and Gupta, Anupam and Bar-Joseph, Ziv}, year={2010}, month=nov, pages={e22–e22}, language={en} }
1 change: 1 addition & 0 deletions test/attribution/expected/mincostflow.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@article{Yeger-Lotem_Riva_Su_Gitler_Cashikar_King_Auluck_Geddie_Valastyan_Karger_et al._2009, title={Bridging high-throughput genetic and transcriptional data reveals cellular responses to alpha-synuclein toxicity}, volume={41}, url={http://dx.doi.org/10.1038/ng.337}, DOI={10.1038/ng.337}, number={3}, journal={Nature Genetics}, publisher={Springer Science and Business Media LLC}, author={Yeger-Lotem, Esti and Riva, Laura and Su, Linhui Julie and Gitler, Aaron D and Cashikar, Anil G and King, Oliver D and Auluck, Pavan K and Geddie, Melissa L and Valastyan, Julie S and Karger, David R and Lindquist, Susan and Fraenkel, Ernest}, year={2009}, month=feb, pages={316–323}, language={en} }
1 change: 1 addition & 0 deletions test/attribution/expected/omicsintegrator1.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@article{Tuncbag_Gosline_Kedaigle_Soltis_Gitter_Fraenkel_2016, title={Network-Based Interpretation of Diverse High-Throughput Datasets through the Omics Integrator Software Package}, volume={12}, url={http://dx.doi.org/10.1371/journal.pcbi.1004879}, DOI={10.1371/journal.pcbi.1004879}, number={4}, journal={PLOS Computational Biology}, publisher={Public Library of Science (PLoS)}, author={Tuncbag, Nurcan and Gosline, Sara J. C. and Kedaigle, Amanda and Soltis, Anthony R. and Gitter, Anthony and Fraenkel, Ernest}, editor={Prlic, Andreas}, year={2016}, month=apr, pages={e1004879}, language={en} }
1 change: 1 addition & 0 deletions test/attribution/expected/omicsintegrator2.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@article{Tuncbag_Gosline_Kedaigle_Soltis_Gitter_Fraenkel_2016, title={Network-Based Interpretation of Diverse High-Throughput Datasets through the Omics Integrator Software Package}, volume={12}, url={http://dx.doi.org/10.1371/journal.pcbi.1004879}, DOI={10.1371/journal.pcbi.1004879}, number={4}, journal={PLOS Computational Biology}, publisher={Public Library of Science (PLoS)}, author={Tuncbag, Nurcan and Gosline, Sara J. C. and Kedaigle, Amanda and Soltis, Anthony R. and Gitter, Anthony and Fraenkel, Ernest}, editor={Prlic, Andreas}, year={2016}, month=apr, pages={e1004879}, language={en} }
Loading
Loading