release 1.0.8

ftwkoopmans · May 24, 2024 · d1018ce · d1018ce
1 parent f1b12fa
commit d1018ce
Show file tree

Hide file tree

Showing 15 changed files with 285 additions and 84 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Package: msdap
 Title: Mass Spectrometry Downstream Analysis Pipeline
 Description: Analyze label-free proteomics datasets from various sources (MaxQuant, Spectronaut, etc) using a pipeline that facilitates peptide filtering and many algorithms for normalization and statistical analysis. A comprehensive PDF report can be generated that includes many data visualizations and documentation thereof.
 URL: https://github.com/ftwkoopmans/msdap
-Version: 1.0.7
+Version: 1.0.8
 Authors@R: 
     person(given = "Frank",
            family = "Koopmans",

diff --git a/R/dea.R b/R/dea.R
@@ -17,11 +17,13 @@
 #'
 #' **msqrob**: implementation of the MSqRob package, with minor tweak for (situationally) faster computation (PMID:26566788) <https://github.com/statOmics/msqrob>. This is a peptide-level DEA algorithm. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob}
 #'
+#' **msqrob_fc**: msqrob estimated protein pvalues, but with log2fc values computed by eBayes. This avoids the excessive shrinkage of protein log2fc values by MSqRob, but does use the MSqRob robust peptide regression model for protein p-value estimates (which outperforms eBayes/DEqMS in ROC analyses on most benchmark datasets, so this method is intended as a "best of both worlds")
+#'
 #' **msqrobsum**: implementation of the MSqRob package (which also features MSqRobSum), with minor tweak for (situationally) faster computation (PMID:32321741) <https://github.com/statOmics/msqrob>. This is a hybrid peptide&protein-level DEA algorithm that takes peptide-level data as input; it first performs peptide-to-protein rollup, then applies statistics to this protein-level data matrix. This method will take provided covariates into account (if any). Implemented in function; \code{de_msqrobsum_msqrob}
 #'
 #' @export
 dea_algorithms = function() {
-  return(c("ebayes", "deqms", "msempire", "msqrobsum", "msqrob"))
+  return(c("ebayes", "deqms", "msempire", "msqrobsum", "msqrob")) # , "msqrob_fc"
 }
 
 
@@ -261,9 +263,11 @@ dea = function(dataset, qval_signif = 0.01, fc_signif = 0, dea_algorithm = "deqm
           # ! compared to the other dea algorithms, MS-EmpiRe is not a regression model so we cannot add random variables
           alg_result = de_msempire(eset_peptides, input_intensities_are_log2 = T)
         } else if(alg == "msqrob") {
-          alg_result = de_msqrobsum_msqrob(eset_peptides, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars)
+          alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE)
+        # } else if(alg == "msqrob_fc") {
+        #   alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = eset_proteins, use_peptide_model = T, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = TRUE)
         } else if(alg == "msqrobsum") {
-          alg_result = de_msqrobsum_msqrob(eset_peptides, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars)
+          alg_result = de_msqrobsum_msqrob(eset_peptides, eset_proteins = NULL, use_peptide_model = F, input_intensities_are_log2 = T, random_variables = ranvars, log2fc_without_shrinkage = FALSE)
         } else {
           # for non-hardcoded functions, we call the function requested as a user parameter and pass all available data
           alg_fun = match.fun(alg) # throws error if not found

diff --git a/R/filter_peptides.R b/R/filter_peptides.R
@@ -58,9 +58,13 @@ cache_filtering_data = function(dataset) {
   p[, key_peptide_group_noexclude := kpg]
   # detect not as a boolean, but scaled by number of detect per sample
   # ` * !is.na(key_peptide_group_noexclude)` is simply to remove all data points for samples that are excluded, which allows us to downstream group by key_group and not worry about excluded samples
-  p[, `:=` (detect_scaled_noexclude = (detect/sum(detect)) * !is.na(key_peptide_group_noexclude),
-            quant_scaled_noexclude = 1/.N * !is.na(key_peptide_group_noexclude)),
+  p[, c("detect_scaled_noexclude", "quant_scaled_noexclude") := list(
+    (detect/sum(detect)) * !is.na(key_peptide_group_noexclude),
+    1/.N * !is.na(key_peptide_group_noexclude) ),
     by = key_sample]
+  # p[, `:=` (detect_scaled_noexclude = (detect/sum(detect)) * !is.na(key_peptide_group_noexclude),
+  #           quant_scaled_noexclude = 1/.N * !is.na(key_peptide_group_noexclude)),
+  #   by = key_sample]
 
 
   ## efficiently collapse filtering data for each peptide * 'sample group'
@@ -78,7 +82,10 @@ cache_filtering_data = function(dataset) {
   data.table::setkey(dt_pep_group, key_group) # set key for data.table::merge()
   dt_criteria = data.table::data.table(grps %>% select(key_group, size, size_noexclude) %>% mutate_all(as.integer), key="key_group")
   dt_pep_group = data.table::merge.data.table(dt_pep_group, dt_criteria, all.x = T, sort = FALSE)
-  dt_pep_group[ , `:=`(ndetect_fraction = ndetect / size, ndetect_noexclude_fraction = ndetect_noexclude / size_noexclude)]
+  dt_pep_group[ , c("ndetect_fraction", "ndetect_noexclude_fraction") := list(
+    ndetect / size,
+    ndetect_noexclude / size_noexclude)]
+  # dt_pep_group[ , `:=`(ndetect_fraction = ndetect / size, ndetect_noexclude_fraction = ndetect_noexclude / size_noexclude)]
 
 
   ################# pre-cache CoV
@@ -279,19 +286,25 @@ filter_dataset = function(dataset,
   data.table::setkey(dataset$dt_pep_group, key_group) # set key for data.table::merge()
   dt_criteria = data.table::data.table(dataset$groups %>% select(key_group, ndetect_min, nquant_min, ndetect_noexclude_min, nquant_noexclude_min) %>% mutate_all(as.integer), key="key_group")
   dt_filter_group = data.table::merge.data.table(dataset$dt_pep_group, dt_criteria, all.x = T, sort = FALSE)
-  dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min,
-                          pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)]
+  dt_filter_group[ , c("pass", "pass_noexclude") := list(
+    ndetect >= ndetect_min & nquant >= nquant_min,
+    ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min
+  )]
+  # dt_filter_group[ , `:=`(pass = ndetect >= ndetect_min & nquant >= nquant_min,
+  #                         pass_noexclude = ndetect_noexclude >= ndetect_noexclude_min & nquant_noexclude >= nquant_noexclude_min)]
 
   ## now we know for each peptide whether it passes the filter criteria in each group (with and without taking 'exclude' samples into account)
 
   ## wide format for re-use (eg; by-group or all-group)
   data.table::setkey(dt_filter_group, pass_noexclude) # set key for filter step below; remove all peptide*group that don't pass anywhere to minimize wide-table size
   dt_filter_group_wide_noexclude = data.table::dcast(dt_filter_group[pass_noexclude>0], key_peptide ~ key_group, value.var = "pass_noexclude", fill = FALSE)
-  dt_filter_group_wide_noexclude[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))]
+  dt_filter_group_wide_noexclude[ , ngroup := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))]
+  # dt_filter_group_wide_noexclude[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide_noexclude), "key_peptide")))]
   # analogous
   data.table::setkey(dt_filter_group, pass)
   dt_filter_group_wide = data.table::dcast(dt_filter_group[pass>0], key_peptide ~ key_group, value.var = "pass", fill = FALSE)
-  dt_filter_group_wide[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))]
+  dt_filter_group_wide[ , ngroup := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))]
+  # dt_filter_group_wide[ , `:=`(ngroup = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(setdiff(colnames(dt_filter_group_wide), "key_peptide")))]
 
 
   if(by_group) {
@@ -351,15 +364,17 @@ filter_dataset = function(dataset,
         grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp1_key)]] ]
       } else {
         # if multi-group, select all columns for this group and test that all are true
-        dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))]
+        dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp1_key)))]
+        # dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp1_key)))]
         grp1_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp1_key) ]
       }
 
       if(length(grp2_key) == 1) {
         grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude[[as.character(grp2_key)]] ]
       } else {
         # if multi-group, select all columns for this group and test that all are true
-        dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))]
+        dt_filter_group_wide_noexclude[, temp := matrixStats::rowSums2(as.matrix(.SD)), .SDcols=eval(quote(as.character(grp2_key)))]
+        # dt_filter_group_wide_noexclude[, `:=`(temp = matrixStats::rowSums2(as.matrix(.SD))), .SDcols=eval(quote(as.character(grp2_key)))]
         grp2_mask = dataset$peptides$key_peptide %in% dt_filter_group_wide_noexclude$key_peptide[ dt_filter_group_wide_noexclude$temp == length(grp2_key) ]
       }
 

diff --git a/R/plot_peptide_data.R b/R/plot_peptide_data.R
@@ -1,17 +1,57 @@
 
 #' plot peptide-level data
 #'
-#' @param dataset todo
-#' @param select_all_proteins todo
-#' @param select_diffdetect_candidates todo
-#' @param select_dea_signif todo
-#' @param filter_protein_ids todo
-#' @param filter_genes todo
-#' @param output_dir todo
-#' @param show_unused_datapoints todo
-#' @param norm_algorithm todo
+#' @examples \dontrun{
+#' # example 1:
+#' # <assuming you imported a dataset and applied analysis_quickstart()>
+#' # plot all significant proteins found by DEA
+#' plot_peptide_data( dataset, select_dea_signif = TRUE,
+#'   # should match analysis_quickstart() parameters !
+#'   norm_algorithm = c("vwmb", "modebetween_protein"),
+#'   show_unused_datapoints = TRUE, output_dir = "C:/temp")
+#'
+#' # example 2:
+#' # filter the DEA results for some specific proteins, then plot these
+#' pid = dataset$de_proteins %>%
+#'   filter(dea_algorithm == "deqms" & qvalue < 10^-4) %>%
+#'   pull(protein_id)
+#' plot_peptide_data(
+#'   dataset, filter_protein_ids = pid,
+#'   # should match analysis_quickstart() parameters !
+#'   norm_algorithm = c("vwmb", "modebetween_protein"),
+#'   show_unused_datapoints = TRUE, output_dir = "C:/temp")
+#'
+#' # example 3:
+#' # plot specific proteins by providing gene symbols (not case sensitive)
+#' plot_peptide_data(
+#'   dataset, filter_genes = c("shisa6", "gria2"),
+#'   # should match analysis_quickstart() parameters !
+#'   norm_algorithm = c("vwmb", "modebetween_protein"),
+#'   show_unused_datapoints = TRUE, output_dir = "C:/temp")
+#' }
+#' @param dataset your dataset
+#' @param select_all_proteins plot all proteins. This will take a long time. Default: FALSE
+#' @param select_diffdetect_candidates if "differential detection" was enabled (it is by default in `analysis_quickstart`), plot candidate proteins based on their zscores.
+#' Valid parameters are positive numeric values that are to be used as absolute z-score cutoffs (4 or 5 are typically reasonable; use `plot_differential_detect` to visualize the score distributions).
+#' This selects all proteins that meet this criterium in any contrast, and plots these proteins in every contrast (include those where it is has a z-score below given threshold).
+#' Alternative, set to TRUE to use an absolute z-score cutoff value of 4. Default: NA (disable)
+#' @param select_dea_signif plot all significant proteins from DEA?
+#' This effectively checks the "signif" column in the `dataset$de_proteins` data table (DEA results generated by MS-DAP), finds all proteins that are signif in any contrast, and plots these proteins in every contrast (include those where it is not signif).
+#' Boolean parameter, default: FALSE
+#' @param filter_protein_ids a vector of protein_id values that should be plotted. For example, you can find specific protein_id values in the differential expression result table and provide only these as a parameter
+#' (in this case, set `select_dea_signif=FALSE` to ensure only your specific protein identifiers are plotted)
+#' @param filter_genes a vector of gene symbols to plot (alternative for filter_protein_ids)
+#' @param output_dir full path to the output directory
+#' @param show_unused_datapoints should data points be shown for peptides that were not used in the differential expression analyses? i.e. peptides that did not pass the filtering prior to DEA (found in N samples in both experimental conditions). Boolean value, default: FALSE
+#' @param norm_algorithm normalization algorithms. This should be exactly the same as the parameter your provided to `analysis_quickstart`
 #' @export
-plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = TRUE, select_dea_signif = TRUE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) {
+plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffdetect_candidates = NA, select_dea_signif = FALSE, filter_protein_ids = NA, filter_genes = NA, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA) {
+  if(length(select_diffdetect_candidates) == 1 && select_diffdetect_candidates %in% TRUE) {
+    select_diffdetect_candidates = 4 # default z-score
+  }
+  if(length(select_diffdetect_candidates) != 1 || !is.numeric(select_diffdetect_candidates) || select_diffdetect_candidates <= 0) {
+    select_diffdetect_candidates = NA
+  }
   pid = NULL
 
   ## if requested to plot all proteins, just take the unique set from peptide tibble and don't bother with other selection parameters
@@ -40,11 +80,11 @@ plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffde
       pid = c(pid, dataset$de_proteins %>% filter(signif) %>% distinct(protein_id) %>% pull())
     }
 
-    if(select_diffdetect_candidates) {
+    if(!is.na(select_diffdetect_candidates)) {
       if(!"dd_proteins" %in% names(dataset)) {
         append_log("you requested to plot proteins tested by differential detection counting, but no differential detection has not been applied to this dataset yet (did you run the analysis_quickstart() function yet?  Alternatively you can simply run differential_detect() and review its outcome with print_dataset_summary() to quickly prepare this selection)", type = "error")
       }
-      pid = c(pid, dataset$dd_proteins %>% filter(type == "detect" & is.finite(zscore) & abs(zscore) >= 4) %>% distinct(protein_id) %>% pull())
+      pid = c(pid, dataset$dd_proteins %>% filter(type == "detect" & is.finite(zscore) & abs(zscore) >= select_diffdetect_candidates) %>% distinct(protein_id) %>% pull())
     }
   }
 
@@ -71,7 +111,7 @@ plot_peptide_data = function(dataset, select_all_proteins = FALSE, select_diffde
 #' @importFrom stringr str_trunc
 #' @importFrom foreach foreach
 #' @importFrom parallel stopCluster clusterExport
-plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, output_dir, show_unused_datapoints = FALSE, norm_algorithm=NA, rollup_algorithm = "maxlfq") {
+plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, output_dir, show_unused_datapoints = FALSE, norm_algorithm = NA, rollup_algorithm = "maxlfq") {
   filter_protein_id = unique(na.omit(filter_protein_id))
   if(length(filter_protein_id) == 0) {
     append_log("no valid protein identifiers were provided", type = "error")
@@ -203,22 +243,23 @@ plot_peptide_data_by_contrast_to_pdf = function(dataset, filter_protein_id, outp
 
     if("dd_proteins" %in% names(dataset)) {
       tib_dd = dataset$dd_proteins %>% filter(protein_id %in% proteins_contr$protein_id & contrast == contr & is.finite(zscore))
+      if(nrow(tib_dd) > 0) {
+        # infer order in which to plot proteins
+        contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id))
 
-      # infer order in which to plot proteins
-      contr_protein_id__ordered = c(contr_protein_id__ordered, tib_dd %>% arrange(desc(abs(zscore))) %>% pull(protein_id))
-
-      # gather differential detect scores in expected format
-      tib_dd_long = tib_dd %>% select(protein_id, foldchange = zscore) %>% mutate(algorithm = paste0("zscore", type, "counts"))
+        # gather differential detect scores in expected format
+        tib_dd_long = tib_dd %>% select(protein_id, type, foldchange = zscore) %>% mutate(algorithm = paste0("zscore-", type, "-counts"))
 
-      # pretty-print format for downstream visualization
-      tib_dd_long = tib_dd_long %>%
-        mutate(foldchange = sprintf("%.2f", foldchange),
-               pvalue="",
-               qvalue="",
-               signif="") %>%
-        select(protein_id, algorithm, foldchange, pvalue, qvalue, signif)
-
-      de_proteins_contr = bind_rows(de_proteins_contr, tib_dd_long)
+        # pretty-print format for downstream visualization
+        tib_dd_long = tib_dd_long %>%
+          mutate(foldchange = sprintf("%.2f", foldchange),
+                 pvalue="",
+                 qvalue="",
+                 signif="") %>%
+          select(protein_id, algorithm, foldchange, pvalue, qvalue, signif)
+
+        de_proteins_contr = bind_rows(de_proteins_contr, tib_dd_long)
+      }
     }
 
 
@@ -393,7 +434,7 @@ reintroduce_filtered_intensities = function(dataset, norm_algorithm, rollup_algo
         # x_ref[rows_sid] = intensity values in sample s for only the subset of peptides that pass user's filtering criteria, that were subsequently normalized
         # if we want to reintroduce intensity values from the unfiltered dataset, where normalization was applied to the total set of peptides, there has to be some variation between the filtered and unfiltered data !
         # so we here scale the abundances of the 'unfiltered data' to the 'filtered data'
-        x[rows_sid] = x[rows_sid] + get_mode(x[rows_sid] - x_ref[rows_sid])
+        x[rows_sid] = x[rows_sid] + suppressWarnings(get_mode(x[rows_sid] - x_ref[rows_sid]))
       }
     }