diff --git a/Snakefile b/Snakefile index cf075b0fa..6ff4d73a9 100644 --- a/Snakefile +++ b/Snakefile @@ -20,12 +20,15 @@ wildcard_constraints: # without declaration! _config.init_global(config) +def without_keys(d: dict, keys: list): + if set(keys) & set(d.keys()) != set(keys): raise RuntimeError(f"Keys {keys} not fully present in {list(d.keys())}!") + return {k: v for k, v in d.items() if k not in keys} + out_dir = _config.config.out_dir algorithm_params = _config.config.algorithm_params -pca_params = _config.config.pca_params -hac_params = _config.config.hac_params container_settings = _config.config.container_settings -include_aggregate_algo_eval = _config.config.analysis_include_evaluation_aggregate_algo +pca_params = without_keys(vars(_config.config.analysis.pca), ["pca_chosen", "include", "aggregate_per_algorithm"]) +hac_params = without_keys(vars(_config.config.analysis.hac), ["include", "aggregate_per_algorithm"]) # Return the dataset or gold_standard dictionary from the config file given the label def get_dataset(_datasets, label): @@ -70,55 +73,76 @@ def write_dataset_log(dataset, logfile): def make_final_input(wildcards): final_input = [] - if _config.config.analysis_include_summary: + if _config.config.analysis.summary.include: # add summary output file for each pathway # TODO: reuse in the future once we make summary work for mixed graphs. See https://github.com/Reed-CompBio/spras/issues/128 # final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) # add table summarizing all pathways for each dataset final_input.extend(expand('{out_dir}{sep}{dataset}-pathway-summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels)) - if _config.config.analysis_include_cytoscape: + if _config.config.analysis.cytoscape.include: final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels)) - if _config.config.analysis_include_ml: + if _config.config.analysis.pca.include: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - if _config.config.analysis_include_ml_aggregate_algo: + if _config.config.analysis.pca.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + + if _config.config.analysis.hac.include: + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis.hac.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + + if _config.config.analysis.ensemble.include: + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis.ensemble.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) + + if _config.config.analysis.jaccard.include: + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis.jaccard.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) - if _config.config.analysis_include_evaluation: + if _config.config.analysis.evaluation.include: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + # dummy file final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) - - if _config.config.analysis_include_evaluation_aggregate_algo: + + if _config.config.analysis.evaluation.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) + + if _config.config.analysis.pca.pca_chosen.include: + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + + if _config.config.analysis.pca.pca_chosen.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + + if _config.config.analysis.ensemble.evaluation.include: + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + + if _config.config.analysis.ensemble.evaluation.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) @@ -458,7 +482,7 @@ rule evaluation_per_algo_pr_per_pathways: run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table pr_df = Evaluation.node_precision_and_recall(input.pathways, node_table) - Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, include_aggregate_algo_eval) + Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, _config.config.analysis.evaluation.aggregate_per_algorithm) # Return pathway summary file per dataset def collect_summary_statistics_per_dataset(wildcards): @@ -506,7 +530,7 @@ rule evaluation_per_algo_pca_chosen: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir) pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathways, node_table) - Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, include_aggregate_algo_eval) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, _config.config.analysis.pca.pca_chosen.aggregate_per_algorithm) # Return the dataset pickle file for a specific dataset def get_dataset_pickle_file(wildcards): @@ -549,7 +573,7 @@ rule evaluation_per_algo_ensemble_pr_curve: run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table node_ensembles_dict = Evaluation.edge_frequency_node_ensemble(node_table, input.ensemble_files, input.dataset_file) - Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, include_aggregate_algo_eval) + Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, _config.config.analysis.evaluation.aggregate_per_algorithm) rule evaluation_edge_dummy: input: diff --git a/config/config.yaml b/config/config.yaml index f2899fb9a..71564424f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -204,33 +204,50 @@ analysis: # Create Cytoscape session file with all pathway graphs for each dataset cytoscape: include: true - # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset - ml: - # ml analysis per dataset + # The following analysis options also have an `aggregate_per_algorithm` option, + # which adds the respective analysis to an algorithm as a whole. + # This will only run if the adjacent `include` is true. 
+ + # Principal component analysis of the pathway output files + pca: include: true - # adds ml analysis per algorithm output - # only runs for algorithms with multiple parameter combinations chosen aggregate_per_algorithm: true + pca_chosen: + include: true + aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph labels: true - # 'ward', 'complete', 'average', 'single' - # if linkage: ward, must use metric: euclidean - linkage: 'ward' - # 'euclidean', 'manhattan', 'cosine' - metric: 'euclidean' # controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots. # the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file. # KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used # to pick the 'best' parameter combination. kde: true - # removes empty pathways from consideration in ml analysis (pca only) + # removes empty pathways from consideration in ml analysis remove_empty_pathways: false + # Hierarchical agglomerative clustering analysis of the pathway output files + hac: + include: true + aggregate_per_algorithm: true + # 'ward', 'complete', 'average', 'single' + # if linkage: ward, must use metric: euclidean + linkage: 'ward' + # 'euclidean', 'manhattan', 'cosine' + metric: 'euclidean' + # Ensemble analysis of the pathway output files + ensemble: + include: true + aggregate_per_algorithm: true + evaluation: + include: true + aggregate_per_algorithm: true + # Jaccard similarity analysis of the pathway output files + jaccard: + include: true evaluation: - # evaluation per dataset-goldstandard pair - # evaluation will not run unless ml include is set to true + # evaluation per dataset-goldstandard pair. + # This evaluation specifically generates precision-recall curves: + # to run evaluation on top of the other options, see the respective `evaluation` blocks under the other analyses. include: true - # adds evaluation per algorithm per dataset-goldstandard pair - # evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are set to true aggregate_per_algorithm: true diff --git a/config/egfr.yaml b/config/egfr.yaml index b93c593c4..8c6d2b76b 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -124,16 +124,33 @@ reconstruction_settings: locations: reconstruction_dir: output/egfr analysis: - cytoscape: - include: true summary: include: true - ml: + cytoscape: + include: true + pca: include: true aggregate_per_algorithm: true + pca_chosen: + include: true + aggregate_per_algorithm: true + components: 2 labels: true kde: true remove_empty_pathways: true - evaluation: + hac: include: true aggregate_per_algorithm: true + linkage: 'ward' + metric: 'euclidean' + ensemble: + include: true + aggregate_per_algorithm: true + evaluation: + include: true + aggregate_per_algorithm: true + jaccard: + include: false + evaluation: + include: false + aggregate_per_algorithm: false diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index 1e7fd69c2..adf044d55 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -137,18 +137,48 @@ analysis: include: true # Create Cytoscape session file with all pathway graphs for each dataset cytoscape: - include: false - # Machine learning analysis (e.g.
clustering) of the pathway output files for each dataset - ml: include: true + # The following analysis options also have an `aggregate_per_algorithm` option, + # which adds the respective analysis to an algorithm as a whole. + # This will only run if the adjacent `include` is true. + + # Principal component analysis of the pathway output files + pca: + include: true + aggregate_per_algorithm: true + pca_chosen: + include: true + aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph labels: true + # controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots. + # the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file. + # KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used + # to pick the 'best' parameter combination. + kde: true + # removes empty pathways from consideration in ml analysis + remove_empty_pathways: false + # Hierarchical agglomerative clustering analysis of the pathway output files + hac: + include: true + aggregate_per_algorithm: true # 'ward', 'complete', 'average', 'single' # if linkage: ward, must use metric: euclidean linkage: 'ward' # 'euclidean', 'manhattan', 'cosine' metric: 'euclidean' + # Ensemble analysis of the pathway output files + ensemble: + include: true + aggregate_per_algorithm: true + evaluation: + include: true + aggregate_per_algorithm: true evaluation: - include: false + # evaluation per dataset-goldstandard pair. + # This evaluation specifically generates precision-recall curves: + # to run evaluation on top of the other options, see the respective `evaluation` blocks under the other analyses. + include: true + aggregate_per_algorithm: true diff --git a/docs/tutorial/advanced.rst b/docs/tutorial/advanced.rst index 8f7e8b645..569733631 100644 --- a/docs/tutorial/advanced.rst +++ b/docs/tutorial/advanced.rst @@ -106,6 +106,13 @@ When gold standards are provided and evaluation is enabled (``include: true``), analysis: evaluation: include: true + # One could also enable evaluation on top of + # the ensembles or the PCA-chosen pathways. + # For example, + ensemble: + include: true + evaluation: + include: true A gold standard dataset must include the following types of keys and files: diff --git a/docs/tutorial/beginner.rst b/docs/tutorial/beginner.rst index 9c8f7f236..43e265a49 100644 --- a/docs/tutorial/beginner.rst +++ b/docs/tutorial/beginner.rst @@ -199,7 +199,7 @@ Analysis include: true cytoscape: include: true - ml: + pca: include: true diff --git a/docs/tutorial/intermediate.rst b/docs/tutorial/intermediate.rst index 2e569e092..e39ab70cd 100644 --- a/docs/tutorial/intermediate.rst +++ b/docs/tutorial/intermediate.rst @@ -689,25 +689,31 @@ And the file ``egfr-omicsintegrator1-params-GUMLBDZ/pathway.txt`` contains the f MRE11_HUMAN RAD50_HUMAN 1 U -Step 3: Use ML post-analysis +Step 3: Use ML-related post-analysis ============================= -3.1 Adding ML post-analysis to the intermediate configuration +3.1 Adding ML-related post-analysis to the intermediate configuration ------------------------------------------------------------- -To enable the ML analysis, update the analysis section in your configuration file by setting ml to true. +To enable ML-related analysis, update the analysis section in your configuration file by setting ``include: true`` for each ML analysis you want to run. Your analysis section in the configuration file should look like this: ..
code-block:: yaml analysis: - ml: + pca: + include: true + hac: + include: true + ensemble: + include: true + jaccard: include: true ... (other parameters preset) -``ml`` will perform unsupervised analyses such as principal component analysis (PCA), hierarchical agglomerative clustering (HAC), ensembling, and jaccard similarity comparisons of the pathways. +These settings will perform principal component analysis (PCA), hierarchical agglomerative clustering (HAC), ensembling, and Jaccard similarity comparisons of the pathways, respectively. -- The ``ml`` section includes configurable parameters that let you adjust the behavior of the analyses performed. +- These sections include configurable parameters that let you adjust the behavior of the analyses performed. With these updates, SPRAS will run the full set of unsupervised machine learning analyses across all outputs for a given dataset. diff --git a/spras/config/config.py b/spras/config/config.py index e180183cc..3f5803f7c 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -77,29 +77,7 @@ def __init__(self, raw_config: dict[str, Any]): # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() # A dict with the analysis settings - self.analysis_params = parsed_raw_config.analysis - # A dict with the evaluation settings - self.evaluation_params = self.analysis_params.evaluation - # A dict with the ML settings - self.ml_params = self.analysis_params.ml - # A Boolean specifying whether to run ML analysis for individual algorithms - self.analysis_include_ml_aggregate_algo = None - # A dict with the PCA settings - self.pca_params = None - # A dict with the hierarchical clustering settings - self.hac_params = None - # A Boolean specifying whether to run the summary analysis - self.analysis_include_summary = None - # A Boolean specifying whether to run the Cytoscape analysis - self.analysis_include_cytoscape = None - # A Boolean specifying whether to run the ML analysis - self.analysis_include_ml = None - # A Boolean specifying whether to run the Evaluation analysis - self.analysis_include_evaluation = None - # A Boolean specifying whether to run the ML per algorithm analysis - self.analysis_include_ml_aggregate_algo = None - # A Boolean specifying whether to run the evaluation per algorithm analysis - self.analysis_include_evaluation_aggregate_algo = None + self.analysis = parsed_raw_config.analysis self.process_config(parsed_raw_config) @@ -146,7 +124,7 @@ def process_algorithms(self, raw_config: RawConfig): Keys in the parameter dictionary are strings """ prior_params_hashes = set() - self.algorithm_params = dict() + self.algorithm_params: dict[str, Any] = dict() self.algorithms = raw_config.algorithms for alg in self.algorithms: if alg.include: @@ -197,67 +175,12 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_params[alg.name][params_hash] = run_dict - def process_analysis(self, raw_config: RawConfig): - if not raw_config.analysis: - return - - # self.ml_params is a class, pca_params needs to be a dict.
- self.pca_params = { - "components": self.ml_params.components, - "labels": self.ml_params.labels, - "kde": self.ml_params.kde, - "remove_empty_pathways": self.ml_params.remove_empty_pathways - } - - self.hac_params = { - "linkage": self.ml_params.linkage, - "metric": self.ml_params.metric - } - - self.analysis_include_summary = raw_config.analysis.summary.include - self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include - self.analysis_include_ml = raw_config.analysis.ml.include - self.analysis_include_evaluation = raw_config.analysis.evaluation.include - - # Only run ML aggregate per algorithm if analysis include ML is set to True - if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: - self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm - else: - self.analysis_include_ml_aggregate_algo = False - + def process_analysis(self): # Raises an error if Evaluation is enabled but no gold standard data is provided - if self.gold_standards == {} and self.analysis_include_evaluation: + if self.gold_standards == {} and self.analysis.evaluation.include: raise ValueError("Evaluation analysis cannot run as gold standard data not provided. " "Please set evaluation include to false or provide gold standard data.") - # Only run Evaluation if ML is set to True - if not self.analysis_include_ml: - self.analysis_include_evaluation = False - - # Only run Evaluation aggregate per algorithm if analysis include ML is set to True - if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation: - self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm - else: - self.analysis_include_evaluation_aggregate_algo = False - - # Only run Evaluation per algorithm if ML per algorithm is set to True - if not self.analysis_include_ml_aggregate_algo: - self.analysis_include_evaluation_aggregate_algo = False - - # Set kde to True if Evaluation is set to True - # When Evaluation is True, PCA is used to pick a single parameter combination for all algorithms with multiple - # parameter combinations and KDE is used to choose the parameter combination in the PC space - if self.analysis_include_evaluation and not self.pca_params["kde"]: - self.pca_params["kde"] = True - print("Setting kde to true; Evaluation analysis needs to run KDE for PCA-Chosen parameter selection.") - - # Set summary include to True if Evaluation is set to True - # When a PCA-chosen parameter set is chosen, summary statistics are used to resolve tiebreakers. 
- if self.analysis_include_evaluation and not self.analysis_include_summary: - self.analysis_include_summary = True - print("Setting summary include to true; Evaluation analysis needs to use summary statistics for PCA-Chosen parameter selection.") - - def process_config(self, raw_config: RawConfig): self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir @@ -266,4 +189,4 @@ def process_config(self, raw_config: RawConfig): self.process_datasets(raw_config) self.process_algorithms(raw_config) - self.process_analysis(raw_config) + self.process_analysis() diff --git a/spras/config/schema.py b/spras/config/schema.py index e530c5d65..8c6b853d3 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -11,9 +11,10 @@ """ import re +import warnings from typing import Annotated -from pydantic import AfterValidator, BaseModel, ConfigDict +from pydantic import AfterValidator, BaseModel, ConfigDict, model_validator from spras.config.algorithms import AlgorithmUnion from spras.config.container_schema import ContainerSettings @@ -38,42 +39,74 @@ class CytoscapeAnalysis(BaseModel): # Note that CaseInsensitiveEnum is not pydantic: pydantic # has special support for enums, but we avoid the # pydantic-specific "model_config" key here for this reason. -class MlLinkage(CaseInsensitiveEnum): +class HacLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' average = 'average' single = 'single' -class MlMetric(CaseInsensitiveEnum): +class HacMetric(CaseInsensitiveEnum): euclidean = 'euclidean' manhattan = 'manhattan' cosine = 'cosine' -class MlAnalysis(BaseModel): +def implies(source: bool, target: bool, source_str: str, target_str: str): + if target and not source: + warnings.warn(f"{source_str} is False but {target_str} is True; setting {target_str} to False", stacklevel=2) + return False + return target + +class AggregateAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool = False + + model_config = ConfigDict(extra='forbid') + + @model_validator(mode='after') + def check_aggregate_when_include(self): + self.aggregate_per_algorithm = implies(self.include, self.aggregate_per_algorithm, "include", "aggregate_per_algorithm") + return self + +class EvaluationAnalysis(AggregateAnalysis): pass + +class PcaAnalysis(AggregateAnalysis): components: int = 2 labels: bool = True kde: bool = False remove_empty_pathways: bool = False - linkage: MlLinkage = MlLinkage.ward - metric: MlMetric = MlMetric.euclidean + pca_chosen: EvaluationAnalysis = EvaluationAnalysis(include=False) - model_config = ConfigDict(extra='forbid') + @model_validator(mode='after') + def check_include_when_evaluation_include(self): + self.pca_chosen.include = implies(self.include, self.pca_chosen.include, "include", "pca_chosen.include") + self.pca_chosen.aggregate_per_algorithm = implies(self.aggregate_per_algorithm, self.pca_chosen.aggregate_per_algorithm, "aggregate_per_algorithm", "pca_chosen.aggregate_per_algorithm") + return self -class EvaluationAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False +class HacAnalysis(AggregateAnalysis): + linkage: HacLinkage = HacLinkage.ward + metric: HacMetric = HacMetric.euclidean - model_config = ConfigDict(extra='forbid') +class EnsembleAnalysis(AggregateAnalysis): + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + @model_validator(mode='after') + def check_include_when_evaluation_include(self): + self.evaluation.include = implies(self.include, self.evaluation.include, "include", "evaluation.include") + 
self.evaluation.aggregate_per_algorithm = implies(self.aggregate_per_algorithm, self.evaluation.aggregate_per_algorithm, "aggregate_per_algorithm", "evaluation.aggregate_per_algorithm") + return self +class JaccardAnalysis(AggregateAnalysis): pass class Analysis(BaseModel): summary: SummaryAnalysis = SummaryAnalysis(include=False) cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) - ml: MlAnalysis = MlAnalysis(include=False) + pca: PcaAnalysis = PcaAnalysis(include=False) + hac: HacAnalysis = HacAnalysis(include=False) + jaccard: JaccardAnalysis = JaccardAnalysis(include=False) + ensemble: EnsembleAnalysis = EnsembleAnalysis(include=False) evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + """Enables PR curve evaluation.""" - model_config = ConfigDict(extra='forbid') + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) # The default length of the truncated hash used to identify parameter combinations diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml index f16c1dbc7..b6bcf6b44 100644 --- a/test/analysis/input/config.yaml +++ b/test/analysis/input/config.yaml @@ -94,21 +94,47 @@ analysis: # Create Cytoscape session file with all pathway graphs for each dataset cytoscape: include: true - # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset - ml: - # ml analysis per dataset + # The following analysis options also have an `aggregate_per_algorithm` option, + # which adds the respective analysis to an algorithm as a whole. + # This will only run if the adjacent `include` is true. + + # Principal component analysis of the pathway output files + pca: include: false - # adds ml analysis per algorithm output - # only runs for algorithms with multiple parameter combinations chosen - aggregate_per_algorithm: true + aggregate_per_algorithm: false + pca_chosen: + include: false + aggregate_per_algorithm: false # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph labels: true + # controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots. + # the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file. + # KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used + # to pick the 'best' parameter combination. + kde: true + # removes empty pathways from consideration in ml analysis + remove_empty_pathways: false + # Hierarchical agglomerative clustering analysis of the pathway output files + hac: + include: false + aggregate_per_algorithm: false # 'ward', 'complete', 'average', 'single' # if linkage: ward, must use metric: euclidean linkage: 'ward' # 'euclidean', 'manhattan', 'cosine' metric: 'euclidean' + # Ensemble analysis of the pathway output files + ensemble: + include: false + aggregate_per_algorithm: true + evaluation: + include: false + aggregate_per_algorithm: false evaluation: + # evaluation per dataset-goldstandard pair. + # This evaluation specifically generates precision-recall curves: + # to run evaluation on top of the other options, see the respective `evaluation` blocks under the other analyses.
include: false + aggregate_per_algorithm: false diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index 823db03bb..b1ba858ed 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -97,11 +97,31 @@ reconstruction_settings: locations: reconstruction_dir: output/egfr analysis: - cytoscape: - include: true summary: include: true - ml: + cytoscape: + include: true + pca: + include: false + aggregate_per_algorithm: false + pca_chosen: + include: false + aggregate_per_algorithm: false + components: 2 + labels: false + kde: false + remove_empty_pathways: false + hac: + include: false + aggregate_per_algorithm: false + linkage: 'ward' + metric: 'euclidean' + ensemble: include: false + aggregate_per_algorithm: false + evaluation: + include: false + aggregate_per_algorithm: false evaluation: include: false + aggregate_per_algorithm: false diff --git a/test/test_config.py b/test/test_config.py index 41551c381..574db2499 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -94,13 +94,29 @@ def get_test_config(): "summary": { "include": False }, - "ml": { + "pca": { "include": False, "aggregate_per_algorithm": False, + "pca_chosen": { + "include": False + } + }, + "hac": { + "include": False, + "aggregate_per_algorithm": False + }, + "ensemble": { + "include": False, + "evaluation": { + "include": False + } }, "cytoscape": { "include": False }, + "jaccard": { + "include": False + }, "evaluation": { "include": False, "aggregate_per_algorithm": False @@ -304,54 +320,24 @@ def test_config_values(self): MEOParams(local_search=False, max_path_length=2) ]) - @pytest.mark.parametrize("ml_include, eval_include, expected_ml, expected_eval", [ + @pytest.mark.parametrize("include, eval_include, expected_include, expected_eval", [ (True, True, True, True), (True, False, True, False), (False, True, False, False), (False, False, False, False) ]) - def test_eval_ml_coupling(self, ml_include, eval_include, expected_ml, expected_eval): - test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = ml_include - test_config["analysis"]["evaluation"]["include"] = eval_include - config.init_global(test_config) - - assert config.config.analysis_include_ml == expected_ml - assert config.config.analysis_include_evaluation == expected_eval - - @pytest.mark.parametrize("ml_include, ml_agg_include, expected_ml, expected_ml_agg", [ - (True, True, True, True), - (True, False, True, False), - (False, True, False, False), - (False, False, False, False) + @pytest.mark.parametrize("analysis_type, evaluation_type", [ + ("pca", "pca_chosen"), + ("ensemble", "evaluation") ]) - def test_ml_agg_algo_coupling(self, ml_include, ml_agg_include, expected_ml, expected_ml_agg): + def test_eval_pca_coupling(self, include, eval_include, expected_include, expected_eval, analysis_type, evaluation_type): test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = ml_include - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = ml_agg_include + test_config["analysis"][analysis_type]["include"] = include + test_config["analysis"][analysis_type][evaluation_type]["include"] = eval_include config.init_global(test_config) - assert config.config.analysis_include_ml == expected_ml - assert config.config.analysis_include_ml_aggregate_algo == expected_ml_agg - - @pytest.mark.parametrize("eval_include, agg_algo, expected_eval, expected_agg_algo", [ - (True, True, True, True), - (True, False, True, False), - (False, True, False, False), - (False, False, False, 
False), - ]) - def test_eval_agg_algo_coupling(self, eval_include, agg_algo, expected_eval, expected_agg_algo): - test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = True - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True - - test_config["analysis"]["evaluation"]["include"] = eval_include - test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = agg_algo - - config.init_global(test_config) - - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.analysis_include_evaluation_aggregate_algo == expected_agg_algo + assert vars(config.config.analysis)[analysis_type].include == expected_include + assert vars(vars(config.config.analysis)[analysis_type])[evaluation_type].include == expected_eval @pytest.mark.parametrize("ml_include, ml_agg, eval_include, eval_agg, expected_ml, expected_ml_agg, expected_eval, expected_eval_agg", [ (False, True, True, True, False, False, False, False), @@ -360,61 +346,24 @@ def test_eval_agg_algo_coupling(self, eval_include, agg_algo, expected_eval, exp (True, True, True, True, True, True, True, True), (True, False, False, False, True, False, False, False), ]) + @pytest.mark.parametrize("analysis_type, evaluation_type", [ + ("pca", "pca_chosen"), + ("ensemble", "evaluation") + ]) def test_eval_ml_agg_algo_coupling(self, ml_include, ml_agg, eval_include, eval_agg, expected_ml, expected_ml_agg, - expected_eval, expected_eval_agg): - # the value of ml include and ml aggregate_per_algorithm can affect the value of evaluation include and + expected_eval, expected_eval_agg, analysis_type, evaluation_type): + # the value of pca include and pca aggregate_per_algorithm can affect the value of evaluation include and # evaluation aggregate_per_algorithm test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = ml_include - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = ml_agg - test_config["analysis"]["evaluation"]["include"] = eval_include - test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = eval_agg - - config.init_global(test_config) - - assert config.config.analysis_include_ml == expected_ml - assert config.config.analysis_include_ml_aggregate_algo == expected_ml_agg - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.analysis_include_evaluation_aggregate_algo == expected_eval_agg - - @pytest.mark.parametrize("eval_include, kde, expected_eval, expected_kde", [ - (True, True, True, True), - (True, False, True, True), - (False, True, False, True), - (False, False, False, False), - ]) - def test_eval_kde_coupling(self, eval_include, kde, expected_eval, expected_kde): - test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = True - # dealing with other coupling issue - test_config["analysis"]["summary"]["include"] = True - - test_config["analysis"]["ml"]["kde"] = kde - test_config["analysis"]["evaluation"]["include"] = eval_include + test_config["analysis"][analysis_type]["include"] = ml_include + test_config["analysis"][analysis_type]["aggregate_per_algorithm"] = ml_agg + test_config["analysis"][analysis_type][evaluation_type]["include"] = eval_include + test_config["analysis"][analysis_type][evaluation_type]["aggregate_per_algorithm"] = eval_agg config.init_global(test_config) - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.pca_params["kde"] == expected_kde - - @pytest.mark.parametrize("eval_include, summary_include, expected_eval, 
expected_summary", [ - (True, True, True, True), - (True, False, True, True), - (False, True, False, True), - (False, False, False, False), - ]) - def test_eval_summary_coupling(self, eval_include, summary_include, expected_eval, expected_summary): - test_config = get_test_config() - # dealing with other coupling issue - test_config["analysis"]["ml"]["include"] = True - test_config["analysis"]["ml"]["kde"] = True - - test_config["analysis"]["summary"]["include"] = summary_include - test_config["analysis"]["evaluation"]["include"] = eval_include - - config.init_global(test_config) - - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.analysis_include_summary == expected_summary - + assert vars(config.config.analysis)[analysis_type].include == expected_ml, f"Include was not {expected_ml}!" + assert vars(config.config.analysis)[analysis_type].aggregate_per_algorithm == expected_ml_agg, f"Aggregate per algorithm was not {expected_ml_agg}!" + assert vars(vars(config.config.analysis)[analysis_type])[evaluation_type].include == expected_eval, f"evaluation include was not {expected_eval}!" + assert vars(vars(config.config.analysis)[analysis_type])[evaluation_type].aggregate_per_algorithm == expected_eval_agg, f"evaluation aggregate per algorithm was not {expected_eval_agg}!"