release 1.2.1

ftwkoopmans · Oct 6, 2024 · fa8e9cb · fa8e9cb
1 parent 42b6448
commit fa8e9cb
Show file tree

Hide file tree

Showing 19 changed files with 335 additions and 188 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.2
+Version: 1.2.1
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",

diff --git a/NAMESPACE b/NAMESPACE
@@ -59,6 +59,7 @@ export(limma_wrapper)
 export(logger.default)
 export(matrix_to_long)
 export(merge_fractionated_samples)
+export(merge_proteingroups_by_gene)
 export(merge_replicate_samples)
 export(mgi_lookuptable)
 export(minlog10)

diff --git a/R/dea.R b/R/dea.R
@@ -210,6 +210,10 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm
     peptides_for_contrast = dataset$peptides %>%
       select(sample_id, protein_id, peptide_id, sequence_plain, sequence_modified, detect, intensity=!!as.character(col_contr_intensity), any_of(c("confidence","charge"))) %>%
       filter(sample_id %in% samples_for_contrast$sample_id & is.finite(intensity))
+    # if(!all(samples_for_contrast$sample_id %in% unique(peptides_for_contrast$sample_id)) ) {
+    #   append_log("some samples that are defined in the current contrast do not have any datapoints in the dataset$peptides table (at currently selected 'peptide filter') column", type = "error")
+    # }
+
     # peptide ExpressionSet
     eset_peptides = tibble_as_eset(peptides_for_contrast, dataset$proteins, samples_for_contrast)
     # rollup peptide abundance matrix to protein-level

diff --git a/R/export_stats_genesummary.R b/R/export_stats_genesummary.R
@@ -215,9 +215,70 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
   )
 
 
+  # the allstats table is already sorted by "best hits on top" (last few lines @ summarise_stats() function; `arrange(desc(signif), pvalue)` )
+  result = disambiguate_protein_table_by_gene(protein_data = allstats, hgnc = hgnc, gene_ambiguity = gene_ambiguity, xref = xref, distinct_factors = c("contrast", "dea_algorithm"), remove_nohgnc = remove_nohgnc) %>%
+    arrange(desc(signif), pvalue, gene_symbols_or_id)
+
+
+  ### print output tables
+  if(is.na(output_dir)) {
+    append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info")
+  } else {
+    for(contr in unique(result$contrast)) {
+      for(algo_dea in unique(result$dea_algorithm)) {
+        tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea)
+        f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea)
+        openxlsx::write.xlsx(tmp, f)
+      }
+    }
+  }
+
+  return(invisible(result))
+}
+
+
+
+#' given some table with protein-level data, map to HGNC genes and deal with redundant or ambiguous gene-level results
+#'
+#' @description
+#' The order of the data table is pivotal: the first row for each unique gene (criteria in parameter `gene_ambiguity`)) is selected !
+#'
+#' @param protein_data a data.frame with columns (from dataset$proteins) protein_id, gene_symbols, gene_symbols_or_id.
+#' Importantly, if you opt to return only unique entries per gene then the row of the FIRST matching protein_id is
+#' returned, so sort your data table by pvalue upstream !
+#' @param hgnc see `export_stats_genesummary()`
+#' @param gene_ambiguity see `export_stats_genesummary()`
+#' @param xref see `export_stats_genesummary()`
+#' @param remove_nohgnc see `export_stats_genesummary()`
+#' @param distinct_factors a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)
+disambiguate_protein_table_by_gene = function(protein_data, hgnc, gene_ambiguity, xref = NULL, remove_nohgnc = FALSE, distinct_factors = NULL) {
+
+  ### input validation
+  if(!(is.data.frame(hgnc) && all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)))) {
+    append_log("hgnc parameter should be a data.frame with columns hgnc_id, hgnc_symbol, type, value (output from hgnc_lookuptable() function)", type = "error")
+  }
+  if(!(length(gene_ambiguity) == 1 && all(gene_ambiguity %in% c("leading_gene", "prio_specific", "only_specific")))) {
+    append_log("gene_ambiguity parameter should be 1 of the following options: leading_gene, prio_specific, only_specific", type = "error")
+  }
+  if(!is.null(xref) && !(is.data.frame(xref) && nrow(xref) > 0 && ncol(xref) >= 3 && all(c("symbol", "uniprot_id") %in% colnames(xref)))) {
+    append_log("xref parameter should be a data.frame with columns mgi_id/rgd_id, symbol, uniprot_id (output from mgi_lookuptable() or rgd_lookuptable() function)", type = "error")
+  }
+  if(!is.null(xref) && !colnames(xref)[1] %in% c("mgi_id", "rgd_id")) {
+    append_log("xref parameter only works with MGI and RGD atm (output from mgi_lookuptable() or rgd_lookuptable() function): first column must be mgi_id or rgd_id", type = "error")
+  }
+  if(!(length(remove_nohgnc) == 1 && remove_nohgnc %in% c(TRUE, FALSE))) {
+    append_log("remove_nohgnc parameter should be TRUE or FALSE", type = "error")
+  }
+  if(!(is.data.frame(protein_data) && all(c("protein_id", "gene_symbols", "gene_symbols_or_id") %in% colnames(protein_data)))) {
+    append_log("protein_data parameter should be a data.frame with columns protein_id, gene_symbols and gene_symbols_or_id", type = "error")
+  }
+  if(!(is.null(distinct_factors) || (is.character(distinct_factors) & all(distinct_factors %in% colnames(protein_data))))) {
+    append_log("distinct_factors parameter should be NULL or a set of columns in protein_data that should be considered (together with protein_id) factors that describe subsets of data (to return unique gene-level data within)", type = "error")
+  }
+
 
   ### create long-format table of uniprot_id*symbol for each unique proteingroup
-  proteins = allstats %>%
+  proteins = protein_data %>%
     select(protein_id, gene_symbols) %>%
     distinct(protein_id, .keep_all = TRUE) %>%
     mutate(uniprot_id = strsplit(protein_id, "[ ,;]+"),
@@ -242,24 +303,24 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
 
 
 
-  ### now join the unique proteingroup-to-HGNC mappings with table `allstats`
+  ### now join the unique proteingroup-to-HGNC mappings with table `protein_data`
   # hgnc_list = a list column where each protein_id has an array of HGNC identifiers (length > 0), NA where missing/failed
-  allstats = allstats %>%
+  protein_data = protein_data %>%
     left_join(proteins_long %>% select(protein_id, hgnc_list = hgnc_id) %>% tidyr::chop(cols = "hgnc_list"), by = "protein_id")
 
 
 
   ### apply mapping rules
-  ### importantly, sorting of `allstats` table prior to running below code MUST be ascending by pvalue
+  ### importantly, sorting of `protein_data` table prior to running below code MUST be ascending by pvalue
 
-  allstats$hgnc_list_unique_notna = lapply(allstats$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code
-  # dev note: don't use `allstats %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1
+  protein_data$hgnc_list_unique_notna = lapply(protein_data$hgnc_list, function(g) { unique(na.omit(g)) }) # naive/slow, but prefer simple code
+  # dev note: don't use `protein_data %>% filter(lengths(hgnc_list) == 1)` because a proteingroup can contain multiple uniprot identifiers that all map to the same gene --> hgnc_list will not be length 1
 
-  allstats$isambiguous = grepl(";", allstats$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect)
-  #  allstats$isambiguous = lengths(allstats$hgnc_list_unique_notna) > 1  # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna
+  protein_data$isambiguous = grepl(";", protein_data$gene_symbols_or_id, fixed = TRUE) # infer ambiguous directly from uniprot gene symbols (i.e. failed mapping to HGNC have no effect)
+  #  protein_data$isambiguous = lengths(protein_data$hgnc_list_unique_notna) > 1  # alternatively, determine "ambiguous" from successful HGNC mappings @ hgnc_list_unique_notna
 
   # take the first non-NA HGNC identifier (NA if none)
-  allstats$hgnc_id = unlist(lapply(allstats$hgnc_list_unique_notna, function(g) {
+  protein_data$hgnc_id = unlist(lapply(protein_data$hgnc_list_unique_notna, function(g) {
     # always return exactly 1 value, thus downstream unlist() will yield an array that exactly matches table length
     if(length(g) == 0) return(NA_character_)
     return(g[1])
@@ -268,34 +329,33 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
   if(gene_ambiguity == "only_specific") {
     # invalidate all entries that are ambiguous
     # importantly, do this before computing `hgnc_id_FORUNIQUE`
-    allstats$hgnc_id[allstats$isambiguous] = NA_character_
+    protein_data$hgnc_id[protein_data$isambiguous] = NA_character_
   }
 
   if(gene_ambiguity == "prio_specific") {
     # sort table such that ambiguous hits are at the bottom; importantly, ties-of-ambiguity are still sorted as-is
     # taking first match per gene downstream = prio specific/unambiguous over ambiguous proteingroup that has a "better pvalue"
-    allstats = allstats %>% arrange(isambiguous)
+    protein_data = protein_data %>% arrange(isambiguous)
   }
 
   # ! importantly, add a unique ID to proteingroups not mapped to HGNC so we can use distinct() downstream without grouping NA values (failed mappings)
-  allstats$hgnc_id_FORUNIQUE = ifelse(is.na(allstats$hgnc_id), allstats$gene_symbols_or_id, allstats$hgnc_id)
+  protein_data$hgnc_id_FORUNIQUE = ifelse(is.na(protein_data$hgnc_id), protein_data$gene_symbols_or_id, protein_data$hgnc_id)
   # result table: retain unique rows per gene*contrast*dea_algorithm (assuming table has been sorted upstream)
-  result = allstats %>% distinct(hgnc_id_FORUNIQUE, contrast, dea_algorithm, .keep_all = TRUE)
+  distinct_factors = unique(c("hgnc_id_FORUNIQUE", distinct_factors))
+  result = protein_data %>% distinct(dplyr::across(tidyselect::all_of(distinct_factors)), .keep_all = TRUE)
 
 
 
-  ### finally, stable sort result table + add gene database crossreferences
-  # sort table again ("prio_specific" option affects sorting, so here enforce stable ordering)
+  ### add gene database crossreferences
   result = result %>%
-    arrange(desc(signif), pvalue) %>%
     select(-tidyselect::any_of(c("gene_symbols", "symbol", "hgnc_list", "hgnc_list_unique_notna", "hgnc_id_FORUNIQUE", "isambiguous"))) %>%
     hgnc_add_xrefs(hgnc = hgnc)
   # add named column specifically for GOAT
   result$gene = result$entrez_id
 
 
 
-  tmp = allstats %>% distinct(protein_id, .keep_all = TRUE)
+  tmp = protein_data %>% distinct(protein_id, .keep_all = TRUE)
   log_pg_ambiguous = sum(tmp$isambiguous)
 
   if(gene_ambiguity == "only_specific") {
@@ -332,20 +392,5 @@ export_stats_genesummary = function(dataset, gene_ambiguity = "prio_specific", d
     result = result %>% filter(!is.na(hgnc_id))
   }
 
-
-
-  ### print output tables
-  if(is.na(output_dir)) {
-    append_log("output_dir parameter set to NA, skip writing Excel output tables", type = "info")
-  } else {
-    for(contr in unique(result$contrast)) {
-      for(algo_dea in unique(result$dea_algorithm)) {
-        tmp = result %>% filter(contrast == contr & dea_algorithm == algo_dea)
-        f = sprintf("%s/%s__%s.xlsx", output_dir, filename_strip_illegal_characters(sub("contrast: *", "", contr), strict = TRUE), algo_dea)
-        openxlsx::write.xlsx(tmp, f)
-      }
-    }
-  }
-
-  return(invisible(result))
+  return(result)
 }
diff --git a/R/gene_idmapping.R b/R/gene_idmapping.R
@@ -242,7 +242,7 @@ rgd_lookuptable = function(f) {
 #' helper function to match a protein table to HGNC via gene symbols (and synonyms)
 #'
 #' @description typically called from `export_stats_genesummary`
-#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)
+#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimited values)
 #' @param hgnc HGNC lookup table from `hgnc_lookuptable()`
 #' @param use_synonyms boolean; use synonyms? if FALSE, only uses hgnc_symbol
 idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) {
@@ -290,13 +290,13 @@ idmap_symbols = function(proteins_long, hgnc, use_synonyms = TRUE) {
 #' helper function to match a protein table to HGNC via an intermediate mapping table (e.g. MGI/RGD)
 #'
 #' @description typically called from `export_stats_genesummary`
-#' @param proteins_long long-format data.frame with columns protein_id and symbol pairs (i.e. no semicolon-delimted values)
+#' @param proteins_long long-format data.frame with columns uniprot_id and symbol pairs (i.e. no semicolon-delimited values)
 #' @param hgnc HGNC lookup table from `hgnc_lookuptable()`
 #' @param idmap MGI or RGD lookup table from `mgi_lookuptable()` or `rgd_lookuptable()`
 #' @param use_symbols boolean; use symbols to map from proteins_long to the idmap table? if FALSE, only matches by uniprot identifier
 idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) {
   if(!is.data.frame(proteins_long) || nrow(proteins_long) == 0 || !all(c("uniprot_id", "symbol") %in% colnames(proteins_long)) ) {
-    append_log("proteins_long parameter must be a data.frame with columns 'protein_id' and 'symbol'", type = "error")
+    append_log("proteins_long parameter must be a data.frame with columns 'uniprot_id' and 'symbol'", type = "error")
   }
   if(!is.data.frame(hgnc) || nrow(hgnc) == 0 || !all(c("hgnc_id", "hgnc_symbol", "type", "value") %in% colnames(hgnc)) ) {
     append_log("hgnc parameter must be a data.frame with columns 'hgnc_id', 'hgnc_symbol', 'type', 'value' as typically prepared using the hgnc_lookuptable() function", type = "error")
@@ -329,10 +329,12 @@ idmap_uniprotid = function(proteins_long, hgnc, idmap, use_symbols = TRUE) {
   proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(proteins_long$uniprot_id[rows], idmap$uniprot_id)]
 
   # fallback to symbol matching (no `xref_id` yet)
-  rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & #
-    # double-check that we don't even attempt to match missing gene symbols
-    !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2
-  proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))]
+  if(use_symbols) {
+    rows = is.na(proteins_long$hgnc_id) & is.na(proteins_long$xref_id) & #
+      # double-check that we don't even attempt to match missing gene symbols
+      !is.na(proteins_long$symbol) & nchar(proteins_long$symbol) >= 2
+    proteins_long$xref_id[rows] = idmap$XREF_ID_VALUES__TEMPCOL[match(toupper(proteins_long$symbol[rows]), toupper(idmap$symbol))]
+  }
 
 
   ### map to HGNC using crossreference identifiers